library(gbm)
library(foreign)
library(nlme)
library(ROCR)
library(dummies)
library(dplyr)
library(haven)

#load response propensity dataset prepared in stata
mydata <- read_dta("data_readyforRP.dta")

#randomly sort observations for train / Test split
mydata <- mydata[sample(1:nrow(mydata)), ]
id <- mydata$z000001a
  
#Some data prep and cleaning
mydata <- mydata %>% 
  select(-c(a12a003b, z000001a, z000006a))
  
  
table(mydata$participate)
train <- mydata[1:round(0.75*nrow(mydata)),]
test <- mydata %>% 
    anti_join(train)

  ## Boosting Model 
  GMOD1<- gbm(participate ~ . ,
              data = train,
              n.trees = 60000,
              interaction.depth = 4,
              shrinkage = 0.0001,
              distribution = "bernoulli",
              n.minobsinnode =  5,
              bag.fraction = 0.5,
              cv.folds = 4,
             #train.fraction = .7,
              verbose = TRUE)
  

summary(GMOD1)

bestTreeForPrediction <- gbm.perf(GMOD1)
bestTreeForPrediction

#get response propensity scores at tree with best performance
PRED1 <-
  predict.gbm(object = GMOD1,
              newdata = test,
              n.trees = bestTreeForPrediction,
              type = "response")

summary(PRED1)

#predicted scores vs observed scores
PRED2 <- prediction(PRED1, test$participate)

#Specificity vs sensitivity
ROC1 <- performance(PRED2, "tpr", "fpr")
plot(ROC1)
abline(a= 0, b=1)

# Function to determine optimal prob cutpoint using Youden's j
opt.cut = function(perf, pred){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y-1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1-x[[ind]], 
      cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
OPTC <- opt.cut(ROC1, PRED2)
OPTC

yhat.pred<- rep(0,length(PRED1))
yhat.pred[PRED1 > OPTC[3,1]] <- 1
table(yhat.pred , test$participate)
mean(yhat.pred == test$participate)


#AUC
AUC1 <- performance(PRED2, measure = "auc")
AUC1

#R_sq as in McCaffrey et al 2004
# r2 <- with(GMOD1, (train.error[1]-train.error[bestTreeForPrediction])/train.error[1])
# r2

#save response propensity scores with ID for export to stata --- all observations
PREDALL <-
  predict.gbm(object = GMOD1,
              newdata = mydata,
              n.trees = bestTreeForPrediction,
              type = "response")

pscores <- data.frame(PREDALL)
summary(pscores$PREDALL)

plot(density((pscores$PREDALL)[which(mydata$participate==1)], from = 0, to = 1 ))
lines(density((pscores$PREDALL)[which(mydata$participate==0)], from = 0, to = 1 ))

summary(pscores$PREDALL[which(mydata$participate==1)])
summary(pscores$PREDALL[which(mydata$participate==0)])

pscores$id <- id    

write.dta(pscores, file ="pscores.dta") 