##################
# Tree-based Machine Learning Methods for Survey Research
# Christoph Kern, Thomas Klausch, Frauke Kreuter
# GSOEP Example
# R 3.3.2
##################

## 01: Setup

# options(java.parameters = "-Xmx12g")

library(caret)
library(FNN)
library(DMwR)
library(party)
library(partykit)
library(strucchange)
library(coefplot)
library(randomForest)
library(xgboost)
library(bartMachine)
library(pdp)
library(PRROC)
library(latticeExtra)
library(xtable)

# Info on data access: https://www.diw.de/en/soep
# setwd("path")
load(".\\soep1314_prep.RData")

######################### Split and pre-process data #########################

## 02: Training and test set

soep1314c <- subset(soep1314, !is.na(D_response3))

set.seed(133)
inTrain <- createDataPartition(soep1314c$D_response3, 
                               p = .8, 
                               list = FALSE, 
                               times = 1)

soep.train <- soep1314c[inTrain,]
soep.test <- soep1314c[-inTrain,]

soep.train.c <- subset(soep.train, complete.cases(soep.train[,c(118,119,120,123,125,126,130,69,107,110,111,112,113,131,132,133,134,135,137,139,140,142,143,144,145,147,148,159,161,164)]))
soep.test.c <- subset(soep.test, complete.cases(soep.test[,c(118,119,120,123,125,126,130,69,107,110,111,112,113,131,132,133,134,135,137,139,140,142,143,144,145,147,148,159,161,164)]))

## Tuning Setup

cvIndex <- createFolds(soep.train.c$D_response3, 10, returnTrain = T)
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))

ctrl0 <- trainControl(method = "none",
                      classProbs = TRUE)

ctrl3  <- trainControl(method = "cv",
                       number = 10,
                       index = cvIndex,
                       summaryFunction = fiveStats,
                       classProbs = TRUE)

ctrl5  <- trainControl(method = "cv",
                       number = 10,
                       index = cvIndex,
                       summaryFunction = fiveStats,
                       classProbs = TRUE,
                       returnData = FALSE)

ctrl6  <- trainControl(method = "cv",
                       number = 10,
                       index = cvIndex,
                       summaryFunction = fiveStats,
                       classProbs = TRUE,
                       selectionFunction = "bestmin")

## Compute imputed HH-Income
# Prepare data

X_train <- soep.train.c[soep.train.c$hinc_m != 0,]
X_train_d <- dummyVars(hinc_m ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area, 
                       data = X_train, fullRank = TRUE)
X_train <- scale(predict(X_train_d, X_train))

X_test1 <- soep.train.c[soep.train.c$hinc_m == 0,]
X_test1_d <- dummyVars(hinc_m ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area, 
                       data = X_test1, fullRank = TRUE)
X_test1 <- scale(predict(X_test1_d, X_test1))

X_test2 <- soep.test.c[soep.test.c$hinc_m == 0,]
X_test2_d <- dummyVars(hinc_m ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area, 
                       data = X_test2, fullRank = TRUE)
X_test2 <- scale(predict(X_test2_d, X_test2))

y_train <- soep.train.c[soep.train.c$hinc_m != 0,140]

# Tune k

r2 <- NULL
for(i in 1:5*5) {
  pred <- knn.reg(train = X_train, y = y_train, k = i)
  r2[i] = pred$R2Pred
}
r2

# Impute in train and test set

y_knn1 <- knn.reg(train = X_train, test = X_test1, y = y_train, k = 5)
y_knn2 <- knn.reg(train = X_train, test = X_test2, y = y_train, k = 5)

soep.train.c$hinc_mi <- soep.train.c$hinc_m
soep.train.c$hinc_mi[soep.train.c$hinc_m == 0] <- y_knn1$pred

soep.test.c$hinc_mi <- soep.test.c$hinc_m
soep.test.c$hinc_mi[soep.test.c$hinc_m == 0] <- y_knn2$pred

######################### Build ML models #########################

## 03: Logistic Regression

logit4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_mi + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib,
                data = soep.train.c,
                method = "glm",
                family = "binomial",
                trControl = ctrl0)
summary(logit4)

## 04: Model-based Recursive Partitioning
# Smaller MOB (section 4.1)

glmtree0 <- glmtree(D_response3 ~ age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_mi + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area | 
                    y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample, 
                    data = soep.train.c,
                    family = binomial,
                    alpha = 0.05, 
                    minsplit = 1500,
                    maxdepth = 3,
                    prune = "AIC",
                    verbose = T,
                    cluster = intid13)
sctest(glmtree0)
coef(glmtree0)
summary(glmtree0)

names <- c("G_areaApt. 9+ Unit" = "House: Apt. 9+ Unit", "G_areaApt. 3-8 Unit" = "House: Apt. 3-8 Unit", "G_area1-2 Fam. House" = "House: 1-2 Fam. House", D_urbanurban = "Area: Urban", D_eastEast = "Region: East", swb = "SWB", health = "Health", hhgr = "HH-Size", G_famstdwidowed = "Marital: widowed", 
           G_famstddivorced = "Marital: divorced", G_famstdunmarried = "Marital: unmarried", "G_famstdmarried, separate" = "Marital: married, sep.", G_typ1hh2Other = "HH type: Other", "G_typ1hh2Couple w child LE+GT 16" = "HH type: ch. < & > 16", "G_typ1hh2Couple w child GT 16" = "HH type: ch. > 16", 
           "G_typ1hh2Couple w child LE 16" = "HH type: ch. < 16", "G_typ1hh2Single Parent" = "HH type: Single P.", "G_typ1hh2Couple w/o child" = "HH type: Couple w/o ch.", D_ownerOwner = "Home: Owner", hinc_mi = "HH-Inc.", "D_hincmisNon-Response HH-Inc." = "HH-Inc.: Missing", "G_empl2Not Employed" = "Empl.: Not empl.", 
           "G_empl2Marginaly Emp." = "Empl.: Marginal", G_empl2Training = "Empl.: in Training", "G_empl2Part-Time Emp." = "Empl.: Part-Time", "G_migbackindirect mig. back." = "Migration: indirect", "G_migbackdirect mig. back." = "Migration: direct", D_genderfemale = "Gender: Female", education = "Education", age = "Age")

pdf("g2_glmtree0_1-2.pdf", width = 7.5, height = 7)
coefplot(glmtree0[2], title = "", xlab = "Coef.", ylab = "", col = "black", intercept = F, decreasing = T, newNames = names, lwdOuter = 0.5) + 
  theme(text = element_text(size = 15)) 
dev.off()
pdf("g2_glmtree0_2-2.pdf", width = 7.5, height = 7)
coefplot(glmtree0[4], title = "", xlab = "Coef.", ylab = "", col = "black", intercept = F, decreasing = T, newNames = names, lwdOuter = 0.5) + 
  theme(text = element_text(size = 15)) 
dev.off()
pdf("g2_glmtree0_3-2.pdf", width = 7.5, height = 7)
coefplot(glmtree0[5], title = "", xlab = "Coef.", ylab = "", col = "black", intercept = F, decreasing = T, newNames = names, lwdOuter = 0.5) + 
  theme(text = element_text(size = 15)) 
dev.off()

# Bigger MOB (section 4.2)

glmtree4 <- glmtree(D_response3 ~ age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_mi + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib | 
                    y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample, 
                    data = soep.train.c,
                    family = binomial,
                    alpha = 0.05,
                    minsplit = 1000,
                    maxdepth = 4,
                    verbose = T,
                    prune = "AIC")
sctest(glmtree4)
coef(glmtree4)
summary(glmtree4)

## 05: Conditional Inference Trees
# Smaller CTREE (section 4.1)

soep.train.c2 <- soep.train.c
levels(soep.train.c2$D_resp_m2) <- c("New HH", "Interview", "Nonresponse")
levels(soep.train.c2$G_typ1hh2) <- c("1-Pers.", "Couple w/o ch.", "\nSingle P.", "ch. < 16", "ch. > 16", "ch. <> 16", "Other")
names(soep.train.c2)[names(soep.train.c2)=="G_psample"] <- "SOEP_sample"
names(soep.train.c2)[names(soep.train.c2)=="G_typ1hh2"] <- "HH_type"
names(soep.train.c2)[names(soep.train.c2)=="y_soep"] <- "SOEP_years"
names(soep.train.c2)[names(soep.train.c2)=="G_contacts"] <- "Contacts"
names(soep.train.c2)[names(soep.train.c2)=="i_age"] <- "Int_age"
names(soep.train.c2)[names(soep.train.c2)=="D_resp_m2"] <- "Resp_2012"
names(soep.train.c2)[names(soep.train.c2)=="i_lengthint"] <- "Int_lenght"
names(soep.train.c2)[names(soep.train.c2)=="D_hincmis"] <- "HH_Inc_missing"

ct0 <- ctree(D_response3 ~ SOEP_years + Contacts + G_mode2 + D_partref + D_contact + Resp_2012 + missratio + i_gender + Int_age + i_experience + i_responserate + Int_lenght + SOEP_sample + age + education + D_gender + G_migback + G_empl2 + HH_Inc_missing + hinc_m + D_owner + HH_type + G_famstd + hhgr + health + swb + D_east + D_urban + G_area, 
             data = soep.train.c2,
             mincriterion = 0.999, 
             minbucket = 20, 
             maxdepth = 4)
ct0

pdf("g2_ctree0.pdf", width = 11.25, height = 7.5)
plot(ct0, gp = gpar(fontsize = 8.25), ip_args = list(id=F), ep_args = list(justmin=2), tp_args = list(mainlab=""))
dev.off()

# Bigger CTREE (section 4.2)

grid <- expand.grid(mincriterion = c(0.99, 0.95, 0.90, 0.85, 0.75))

set.seed(533)
ct4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_m + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib,
             data = soep.train.c,
             method = "ctree",
             trControl = ctrl3,
             tuneGrid = grid,
             metric = "ROC")

ct4
plot(ct4)
confusionMatrix(ct4)

## 06: Random Forest

grid <- expand.grid(mtry = 1:12*4)

set.seed(533)
rf4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_m + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib,
             data = soep.train.c,
             method = "rf",
             trControl = ctrl3,
             tuneGrid = grid,
             metric = "ROC")

rf4
plot(rf4)

## 07: Gradient Boosting

grid <- expand.grid(max_depth = c(7, 8, 9, 10),
                    nrounds = c(1000, 1500, 2000, 2500),
                    eta = c(0.01, 0.05),
                    min_child_weight = 5,
                    subsample = 0.7,
                    gamma = 0,
                    colsample_bytree = 1)

set.seed(533)
xgb4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_m + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib,
              data = soep.train.c,
              method = "xgbTree",
              trControl = ctrl5,
              tuneGrid = grid,
              metric = "ROC")

xgb4
plot(xgb4)
confusionMatrix(xgb4)

## 08: BART

bestmin <- function (x, metric, maximize) # caret mixed up levels
{
  bestIter <- if (maximize) 
    which.min(x[, metric])
  else which.max(x[, metric])
  bestIter
}

grid <- expand.grid(num_trees = c(50, 200),
                    k = c(1, 2, 3, 5),
                    alpha = 0.95,
                    beta = 2,
                    nu = 3)

set.seed(533)
bart4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_m + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib,
               data = soep.train.c,
               method = "bartMachine",
               trControl = ctrl6,
               tuneGrid = grid,
               num_burn_in = 500,
               num_iterations_after_burn_in = 2000,
               mem_cache_for_speed = FALSE,
               serialize = TRUE,
               metric = "ROC")

bart4
plot(bart4)
confusionMatrix(bart4)
plot_convergence_diagnostics(bart4$finalModel)

######################### Evaluate and compare #########################

## 09: Variable Importances and PDPs

logitimp4 <- varImp(logit4)
ct4imp4 <- varImp(ct4)
rfimp4 <- varImp(rf4)
xgbimp4 <- varImp(xgb4)
bartimp4 <- varImp(bart4)

# g2_varImp1.pdf
trellis.device(color = F)
logitimp4
plot(logitimp4, top = 20, scales = list(y = list(labels = c("HH type: Single P.","Missing ratio","HH type: Other","Marital: married, sep.","Int.: Age","Empl.: Part-Time","Region: East","Migration: indirect","HH type: ch. < & > 16","Contact: mail & phone",
                                                            "Migration: direct","Contact: mail | phone","Int.: Response rate","Empl.: Not empl.","Contacts: 4-6","Sample: L","Education","SOEP years","HH-Inc.: Missing","Contacts: 7+"),
                                                 cex=1.1)))

# g2_varImp2.pdf
trellis.device(color = F)
ct4imp4
plot(ct4imp4, top = 20, scales = list(y = list(labels = c("Int.: Age","Area: Urban","Contact: mail, phone","Home: Owner","Region: East","Health","Inv. Staying Prob.","Empl.","Education","Int.: Response rate","Contacts: 4-6, 7+","2012: Interview, Nonresp.",
                                                          "Mode","Migration: direct, indirect","HH type","HH size","Int.: Experience","Age","SOEP years","Sample"),
                                               cex=1.1)))

# g2_varImp3.pdf
trellis.device(color = F)
rfimp4
plot(rfimp4, top = 20, scales = list(y = list(labels = c("House: 1-2 Fam. House","House: Apt. 3-8 Unit","Area: Urban","Home: Owner","Int.: Female","Gender: Female","Contacts: 4-6","SOEP years","Health","HH size","SWB","Missing ratio","Int.: Experience","Education",
                                                         "Inv. Staying Prob.","Int.: Age","Int.: Response rate","Age","Int.: Length","HH-Inc."),
                                              cex=1.1)))

# g2_varImp4.pdf
trellis.device(color = F)
xgbimp4
plot(xgbimp4, top = 20, scales = list(y = list(labels = c("Sample: L","House: Apt. 3-8 Unit","House: 1-2 Fam. House","Home: Owner","Contacts: 4-6","Int.: Female","Sample: H-K","Health","HH size","SWB","Education","SOEP years","Int.: Experience","Missing ratio",
                                                          "Inv. Staying Prob.","Int.: Age","Age","Int.: Response rate","Int.: Length","HH-Inc."),
                                               cex=1.1)))

# g2_varImp5.pdf
trellis.device(color = F)
bartimp4
plot(bartimp4, top = 20, scales = list(y = list(labels = c("Health","HH type: Other","House: 1-2 Fam. House","Refusal in HH","Empl.: Not empl.","Marital: unmarried","Age","Int.: Age","Sample: L","Region: East",
                                                           "Migration: direct","Education","Contacts: 7+","Int.: Length","HH-Inc.: Missing","Contacts: 4-6","Missing ratio","Int.: Response rate","Marital: married, sep.","SOEP years"),
                                                cex=1.1)))

colfunc <- colorRampPalette(c("white", "gray15"))
cols <- colfunc(100)

# g2_pdp1.pdf
trellis.par.set(regions=list(col=cols))
pd1 <- partial(rf4, pred.var = c("age", "i_age"), train = soep.train.c, type = "classification", which.class = 2, prob = T, trim.outliers = T, grid.resolution = 50, progress = "text")
plotPartial(pd1, levelplot = F, drape = T, colorkey = F, screen = list(z = 45, x = -60), scales = list(col="black"), par.settings = list(axis.line = list(col = "transparent")), zlab=list(label=expression(hat(y)), cex=1.15), xlab=list(label="Age", cex=1.15), ylab=list(label="Int.: Age", cex=1.15), zoom = 0.93)

# g2_pdp2.pdf
trellis.par.set(regions=list(col=cols))
pd2 <- partial(rf4, pred.var = c("education", "i_lengthint"), train = soep.train.c, type = "classification", which.class = 2, prob = T, trim.outliers = T, grid.resolution = 50, progress = "text")
plotPartial(pd2, levelplot = F, drape = T, colorkey = F, screen = list(z = 45, x = -60), scales = list(col="black"), par.settings = list(axis.line = list(col = "transparent")), zlab=list(label=expression(hat(y)), cex=1.15), xlab=list(label="Education", cex=1.15), ylab=list(label="Int.: Length", cex=1.15), zoom = 0.93)

# g2_pdp3.pdf
trellis.par.set(regions=list(col=cols))
pd3 <- partial(rf4, pred.var = c("hinc_m", "missratio"), train = soep.train.c, type = "classification", which.class = 2, prob = T, trim.outliers = T, grid.resolution = 50, progress = "text")
plotPartial(pd3, levelplot = F, drape = T, colorkey = F, screen = list(z = 45, x = -60), scales = list(col="black"), par.settings = list(axis.line = list(col = "transparent")), zlab=list(label=expression(hat(y)), cex=1.15), xlab=list(label="HH-Inc.", cex=1.15), ylab=list(label="Missing ratio", cex=1.15), zoom = 0.93)

# g2_pdp4.pdf
trellis.par.set(regions=list(col=cols))
pd4 <- partial(rf4, pred.var = c("y_soep", "i_responserate"), train = soep.train.c, type = "classification", which.class = 2, prob = T, trim.outliers = T, grid.resolution = 50, progress = "text")
plotPartial(pd4, levelplot = F, drape = T, colorkey = F, screen = list(z = -45, x = -60), scales = list(col="black"), par.settings = list(axis.line = list(col = "transparent")), zlab=list(label=expression(hat(y)), cex=1.15), xlab=list(label="SOEP years", cex=1.15), ylab=list(label="Int.: Response rate", cex=1.15), zoom = 0.93)

## 10: Prediction in test set
# Predict classes and probabilities

p.logit4 <- predict(logit4, newdata = soep.test.c)
pr.logit4 <- predict(logit4, newdata = soep.test.c, type = "prob")

pr.glmt4 <- predict(glmtree4, newdata = soep.test.c, type = "response")
p.glmt4 <- ifelse(pr.glmt4 > 0.5, "refusal", "interview")

p.ct4 <- predict(ct4, newdata = soep.test.c)
pr.ct4 <- predict(ct4, newdata = soep.test.c, type = "prob")

p.rf4 <- predict(rf4, newdata = soep.test.c)
pr.rf4 <- predict(rf4, newdata = soep.test.c, type = "prob")

p.xgb4 <- predict(xgb4, newdata = soep.test.c)
pr.xgb4 <- predict(xgb4, newdata = soep.test.c, type = "prob")

p.bart4 <- predict(bart4, newdata = soep.test.c)
pr.bart4 <- predict(bart4, newdata = soep.test.c, type = "prob")

# ROC and PR curves

fg1 <- pr.logit4$refusal[soep.test.c$D_response3 == "refusal"]
bg1 <- pr.logit4$refusal[soep.test.c$D_response3 == "interview"]
fg3 <- pr.ct4$refusal[soep.test.c$D_response3 == "refusal"]
bg3 <- pr.ct4$refusal[soep.test.c$D_response3 == "interview"]
fg4 <- pr.rf4$refusal[soep.test.c$D_response3 == "refusal"]
bg4 <- pr.rf4$refusal[soep.test.c$D_response3 == "interview"]
fg5 <- pr.xgb4$refusal[soep.test.c$D_response3 == "refusal"]
bg5 <- pr.xgb4$refusal[soep.test.c$D_response3 == "interview"]
fg6 <- pr.glmt4[soep.test.c$D_response3 == "refusal"]
bg6 <- pr.glmt4[soep.test.c$D_response3 == "interview"]
fg7 <- pr.bart4$interview[soep.test.c$D_response3 == "refusal"] # caret mixed up levels
bg7 <- pr.bart4$interview[soep.test.c$D_response3 == "interview"] # caret mixed up levels

logit4.roc <- roc.curve(scores.class0 = fg1, scores.class1 = bg1, curve = T)
ct4.roc <- roc.curve(scores.class0 = fg3, scores.class1 = bg3, curve = T)
rf4.roc <- roc.curve(scores.class0 = fg4, scores.class1 = bg4, curve = T)
xgb4.roc <- roc.curve(scores.class0 = fg5, scores.class1 = bg5, curve = T)
glmt4.roc <- roc.curve(scores.class0 = fg6, scores.class1 = bg6, curve = T)
bart4.roc <- roc.curve(scores.class0 = fg7, scores.class1 = bg7, curve = T)

p1 <- xyplot(logit4.roc$curve[,2] ~ logit4.roc$curve[,1], type = "l", lwd = 2, col = "black",
             xlab = list(label = "1 - Specificity", cex = 1.25), 
             ylab = list(label = "Sensitivity", cex = 1.25),
             key = list(columns = 2, text = list(lab = c("Logit","MOB","CTREE","RF","XGBoost","BART")), cex = 1.25,
                        lines = list(type = "l", lwd = 2, col = c("black", "gray", "#1B9E77", "#7570B3", "#E6AB02", "#D95F02"))),
             scales = list(cex = 1.25),
             panel = function(x, ...){
               panel.xyplot(x, ...)
               panel.abline(0,1)})
p2 <- xyplot(glmt4.roc$curve[,2] ~ glmt4.roc$curve[,1], type = "l", lwd = 2, col = "gray")
p3 <- xyplot(ct4.roc$curve[,2] ~ ct4.roc$curve[,1], type = "l", lwd = 2, col = "#1B9E77")
p4 <- xyplot(rf4.roc$curve[,2] ~ rf4.roc$curve[,1], type = "l", lwd = 2, col = "#7570B3")
p5 <- xyplot(xgb4.roc$curve[,2] ~ xgb4.roc$curve[,1], type = "l", lwd = 2, col = "#E6AB02")
p6 <- xyplot(bart4.roc$curve[,2] ~ bart4.roc$curve[,1], type = "l", lwd = 2, col = "#D95F02")

# ROC plot
x11(width=7, height=7)
p1 + as.layer(p2) + as.layer(p3) + as.layer(p4) + as.layer(p5) + as.layer(p6)

logit4.pr <- pr.curve(scores.class0 = fg1, scores.class1 = bg1, curve = T)
ct4.pr <- pr.curve(scores.class0 = fg3, scores.class1 = bg3, curve = T)
rf4.pr <- pr.curve(scores.class0 = fg4, scores.class1 = bg4, curve = T)
xgb4.pr <- pr.curve(scores.class0 = fg5, scores.class1 = bg5, curve = T)
glmt4.pr <- pr.curve(scores.class0 = fg6, scores.class1 = bg6, curve = T)
bart4.pr <- pr.curve(scores.class0 = fg7, scores.class1 = bg7, curve = T)

p1 <- xyplot(logit4.pr$curve[,2] ~ logit4.pr$curve[,1], type = "l", lwd = 2, col = "black",
             xlab = list(label = "Recall", cex = 1.25), 
             ylab = list(label = "Precision", cex = 1.25),
             ylim = c(-0.07,1.07),
             scales = list(cex = 1.25),
             key = list(columns = 2, 
                        text = list(lab = c("Logit","MOB","CTREE","RF","XGBoost","BART")), cex = 1.25,
                        lines = list(type = "l", lwd = 2, col = c("black", "gray", "#1B9E77", "#7570B3", "#E6AB02", "#D95F02"))))
p2 <- xyplot(glmt4.pr$curve[,2] ~ glmt4.pr$curve[,1], type = "l", lwd = 2, col = "gray")
p3 <- xyplot(ct4.pr$curve[,2] ~ ct4.pr$curve[,1], type = "l", lwd = 2, col = "#1B9E77")
p4 <- xyplot(rf4.pr$curve[,2] ~ rf4.pr$curve[,1], type = "l", lwd = 2, col = "#7570B3")
p5 <- xyplot(xgb4.pr$curve[,2] ~ xgb4.pr$curve[,1], type = "l", lwd = 2, col = "#E6AB02")
p6 <- xyplot(bart4.pr$curve[,2] ~ bart4.pr$curve[,1], type = "l", lwd = 2, col = "#D95F02")

# PR plot
x11(width=7, height=7)
p1 + as.layer(p2) + as.layer(p3) + as.layer(p4) + as.layer(p5) + as.layer(p6)

# Threshold optimization

logit4.roc <- roc(soep.test.c$D_response3, pr.logit4$refusal)
logit4.t0 <- coords(logit4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1))
logit4.t1 <- coords(logit4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1))
pt0.logit4 <- as.factor(ifelse(pr.logit4$refusal > logit4.t0[1], "refusal", "interview"))
pt1.logit4 <- as.factor(ifelse(pr.logit4$refusal > logit4.t1[1], "refusal", "interview"))

ct4.roc <- roc(soep.test.c$D_response3, pr.ct4$refusal)
ct4.t0 <- coords(ct4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1))
ct4.t1 <- coords(ct4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1))
pt0.ct4 <- as.factor(ifelse(pr.ct4$refusal > ct4.t0[1], "refusal", "interview"))
pt1.ct4 <- as.factor(ifelse(pr.ct4$refusal > ct4.t1[1], "refusal", "interview"))

rf4.roc <- roc(soep.test.c$D_response3, pr.rf4$refusal)
rf4.t0 <- coords(rf4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1))
rf4.t1 <- coords(rf4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1))
pt0.rf4 <- as.factor(ifelse(pr.rf4$refusal > rf4.t0[1], "refusal", "interview"))
pt1.rf4 <- as.factor(ifelse(pr.rf4$refusal > rf4.t1[1], "refusal", "interview"))

xgb4.roc <- roc(soep.test.c$D_response3, pr.xgb4$refusal)
xgb4.t0 <- coords(xgb4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1))
xgb4.t1 <- coords(xgb4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1))
pt0.xgb4 <- as.factor(ifelse(pr.xgb4$refusal > xgb4.t0[1], "refusal", "interview"))
pt1.xgb4 <- as.factor(ifelse(pr.xgb4$refusal > xgb4.t1[1], "refusal", "interview"))

glmt4.roc <- roc(soep.test.c$D_response3, pr.glmt4)
glmt4.t0 <- coords(glmt4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1))
glmt4.t1 <- coords(glmt4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1))
pt0.glmt4 <- as.factor(ifelse(pr.glmt4 > glmt4.t0[1], "refusal", "interview"))
pt1.glmt4 <- as.factor(ifelse(pr.glmt4 > glmt4.t1[1], "refusal", "interview"))

bart4.roc <- roc(soep.test.c$D_response3, pr.bart4$interview)
bart4.t0 <- coords(bart4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1))
bart4.t1 <- coords(bart4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1))
pt0.bart4 <- as.factor(ifelse(pr.bart4$interview > bart4.t0[1], "refusal", "interview"))
pt1.bart4 <- as.factor(ifelse(pr.bart4$interview > bart4.t1[1], "refusal", "interview"))

cm10 <- confusionMatrix(pt0.logit4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm30 <- confusionMatrix(pt0.ct4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm40 <- confusionMatrix(pt0.rf4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm50 <- confusionMatrix(pt0.xgb4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm60 <- confusionMatrix(pt0.glmt4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm70 <- confusionMatrix(pt0.bart4, soep.test.c$D_response3, positive = "refusal", mode = "everything")

cm11 <- confusionMatrix(pt1.logit4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm31 <- confusionMatrix(pt1.ct4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm41 <- confusionMatrix(pt1.rf4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm51 <- confusionMatrix(pt1.xgb4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm61 <- confusionMatrix(pt1.glmt4, soep.test.c$D_response3, positive = "refusal", mode = "everything")
cm71 <- confusionMatrix(pt1.bart4, soep.test.c$D_response3, positive = "refusal", mode = "everything")

Logit1 <- c(logit4.roc$auc, logit4.pr$auc.integral)
ctree <- c(ct4.roc$auc, ct4.pr$auc.integral)
RF <- c(rf4.roc$auc, rf4.pr$auc.integral)
GBM <- c(xgb4.roc$auc, xgb4.pr$auc.integral)
glmtree <- c(glmt4.roc$auc, glmt4.pr$auc.integral)
bart <- c(bart4.roc$auc, bart4.pr$auc.integral)

tab <- xtable(t(rbind(Logit1, glmtree, ctree, RF, GBM, bart)), digits = 3)
print(tab, type="latex", file="2_models4_test2.tex")

Logit1 <- c(cm10$overall[1], cm10$byClass[c(1:2,5,7)], cm10$overall[2])
ctree <- c(cm30$overall[1], cm30$byClass[c(1:2,5,7)], cm30$overall[2])
RF <- c(cm40$overall[1], cm40$byClass[c(1:2,5,7)], cm40$overall[2])
GBM <- c(cm50$overall[1], cm50$byClass[c(1:2,5,7)], cm50$overall[2])
glmtree <- c(cm60$overall[1], cm60$byClass[c(1:2,5,7)], cm60$overall[2])
bart <- c(cm70$overall[1], cm70$byClass[c(1:2,5,7)], cm70$overall[2])

tab <- xtable(rbind(Logit1, glmtree, ctree, RF, GBM, bart), digits = 3)
print(tab, type="latex", file="2_models4_test20.tex")

Logit1 <- c(cm11$overall[1], cm11$byClass[c(1:2,5,7)], cm11$overall[2])
ctree <- c(cm31$overall[1], cm31$byClass[c(1:2,5,7)], cm31$overall[2])
RF <- c(cm41$overall[1], cm41$byClass[c(1:2,5,7)], cm41$overall[2])
GBM <- c(cm51$overall[1], cm51$byClass[c(1:2,5,7)], cm51$overall[2])
glmtree <- c(cm61$overall[1], cm61$byClass[c(1:2,5,7)], cm61$overall[2])
bart <- c(cm71$overall[1], cm71$byClass[c(1:2,5,7)], cm71$overall[2])

tab <- xtable(rbind(Logit1, glmtree, ctree, RF, GBM, bart), digits = 3)
print(tab, type="latex", file="2_models4_test21.tex")