################## # Tree-based Machine Learning Methods for Survey Research # Christoph Kern, Thomas Klausch, Frauke Kreuter # GSOEP Example # R 3.3.2 ################## ## 01: Setup # options(java.parameters = "-Xmx12g") library(caret) library(FNN) library(DMwR) library(party) library(partykit) library(strucchange) library(coefplot) library(randomForest) library(xgboost) library(bartMachine) library(pdp) library(PRROC) library(latticeExtra) library(xtable) # Info on data access: https://www.diw.de/en/soep # setwd("path") load(".\\soep1314_prep.RData") ######################### Split and pre-process data ######################### ## 02: Training and test set soep1314c <- subset(soep1314, !is.na(D_response3)) set.seed(133) inTrain <- createDataPartition(soep1314c$D_response3, p = .8, list = FALSE, times = 1) soep.train <- soep1314c[inTrain,] soep.test <- soep1314c[-inTrain,] soep.train.c <- subset(soep.train, complete.cases(soep.train[,c(118,119,120,123,125,126,130,69,107,110,111,112,113,131,132,133,134,135,137,139,140,142,143,144,145,147,148,159,161,164)])) soep.test.c <- subset(soep.test, complete.cases(soep.test[,c(118,119,120,123,125,126,130,69,107,110,111,112,113,131,132,133,134,135,137,139,140,142,143,144,145,147,148,159,161,164)])) ## Tuning Setup cvIndex <- createFolds(soep.train.c$D_response3, 10, returnTrain = T) fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...)) ctrl0 <- trainControl(method = "none", classProbs = TRUE) ctrl3 <- trainControl(method = "cv", number = 10, index = cvIndex, summaryFunction = fiveStats, classProbs = TRUE) ctrl5 <- trainControl(method = "cv", number = 10, index = cvIndex, summaryFunction = fiveStats, classProbs = TRUE, returnData = FALSE) ctrl6 <- trainControl(method = "cv", number = 10, index = cvIndex, summaryFunction = fiveStats, classProbs = TRUE, selectionFunction = "bestmin") ## Compute imputed HH-Income # Prepare data X_train <- soep.train.c[soep.train.c$hinc_m != 0,] X_train_d <- dummyVars(hinc_m ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area, data = X_train, fullRank = TRUE) X_train <- scale(predict(X_train_d, X_train)) X_test1 <- soep.train.c[soep.train.c$hinc_m == 0,] X_test1_d <- dummyVars(hinc_m ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area, data = X_test1, fullRank = TRUE) X_test1 <- scale(predict(X_test1_d, X_test1)) X_test2 <- soep.test.c[soep.test.c$hinc_m == 0,] X_test2_d <- dummyVars(hinc_m ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area, data = X_test2, fullRank = TRUE) X_test2 <- scale(predict(X_test2_d, X_test2)) y_train <- soep.train.c[soep.train.c$hinc_m != 0,140] # Tune k r2 <- NULL for(i in 1:5*5) { pred <- knn.reg(train = X_train, y = y_train, k = i) r2[i] = pred$R2Pred } r2 # Impute in train and test set y_knn1 <- knn.reg(train = X_train, test = X_test1, y = y_train, k = 5) y_knn2 <- knn.reg(train = X_train, test = X_test2, y = y_train, k = 5) soep.train.c$hinc_mi <- soep.train.c$hinc_m soep.train.c$hinc_mi[soep.train.c$hinc_m == 0] <- y_knn1$pred soep.test.c$hinc_mi <- soep.test.c$hinc_m soep.test.c$hinc_mi[soep.test.c$hinc_m == 0] <- y_knn2$pred ######################### Build ML models ######################### ## 03: Logistic Regression logit4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_mi + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib, data = soep.train.c, method = "glm", family = "binomial", trControl = ctrl0) summary(logit4) ## 04: Model-based Recursive Partitioning # Smaller MOB (section 4.1) glmtree0 <- glmtree(D_response3 ~ age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_mi + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area | y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample, data = soep.train.c, family = binomial, alpha = 0.05, minsplit = 1500, maxdepth = 3, prune = "AIC", verbose = T, cluster = intid13) sctest(glmtree0) coef(glmtree0) summary(glmtree0) names <- c("G_areaApt. 9+ Unit" = "House: Apt. 9+ Unit", "G_areaApt. 3-8 Unit" = "House: Apt. 3-8 Unit", "G_area1-2 Fam. House" = "House: 1-2 Fam. House", D_urbanurban = "Area: Urban", D_eastEast = "Region: East", swb = "SWB", health = "Health", hhgr = "HH-Size", G_famstdwidowed = "Marital: widowed", G_famstddivorced = "Marital: divorced", G_famstdunmarried = "Marital: unmarried", "G_famstdmarried, separate" = "Marital: married, sep.", G_typ1hh2Other = "HH type: Other", "G_typ1hh2Couple w child LE+GT 16" = "HH type: ch. < & > 16", "G_typ1hh2Couple w child GT 16" = "HH type: ch. > 16", "G_typ1hh2Couple w child LE 16" = "HH type: ch. < 16", "G_typ1hh2Single Parent" = "HH type: Single P.", "G_typ1hh2Couple w/o child" = "HH type: Couple w/o ch.", D_ownerOwner = "Home: Owner", hinc_mi = "HH-Inc.", "D_hincmisNon-Response HH-Inc." = "HH-Inc.: Missing", "G_empl2Not Employed" = "Empl.: Not empl.", "G_empl2Marginaly Emp." = "Empl.: Marginal", G_empl2Training = "Empl.: in Training", "G_empl2Part-Time Emp." = "Empl.: Part-Time", "G_migbackindirect mig. back." = "Migration: indirect", "G_migbackdirect mig. back." = "Migration: direct", D_genderfemale = "Gender: Female", education = "Education", age = "Age") pdf("g2_glmtree0_1-2.pdf", width = 7.5, height = 7) coefplot(glmtree0[2], title = "", xlab = "Coef.", ylab = "", col = "black", intercept = F, decreasing = T, newNames = names, lwdOuter = 0.5) + theme(text = element_text(size = 15)) dev.off() pdf("g2_glmtree0_2-2.pdf", width = 7.5, height = 7) coefplot(glmtree0[4], title = "", xlab = "Coef.", ylab = "", col = "black", intercept = F, decreasing = T, newNames = names, lwdOuter = 0.5) + theme(text = element_text(size = 15)) dev.off() pdf("g2_glmtree0_3-2.pdf", width = 7.5, height = 7) coefplot(glmtree0[5], title = "", xlab = "Coef.", ylab = "", col = "black", intercept = F, decreasing = T, newNames = names, lwdOuter = 0.5) + theme(text = element_text(size = 15)) dev.off() # Bigger MOB (section 4.2) glmtree4 <- glmtree(D_response3 ~ age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_mi + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib | y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample, data = soep.train.c, family = binomial, alpha = 0.05, minsplit = 1000, maxdepth = 4, verbose = T, prune = "AIC") sctest(glmtree4) coef(glmtree4) summary(glmtree4) ## 05: Conditional Inference Trees # Smaller CTREE (section 4.1) soep.train.c2 <- soep.train.c levels(soep.train.c2$D_resp_m2) <- c("New HH", "Interview", "Nonresponse") levels(soep.train.c2$G_typ1hh2) <- c("1-Pers.", "Couple w/o ch.", "\nSingle P.", "ch. < 16", "ch. > 16", "ch. <> 16", "Other") names(soep.train.c2)[names(soep.train.c2)=="G_psample"] <- "SOEP_sample" names(soep.train.c2)[names(soep.train.c2)=="G_typ1hh2"] <- "HH_type" names(soep.train.c2)[names(soep.train.c2)=="y_soep"] <- "SOEP_years" names(soep.train.c2)[names(soep.train.c2)=="G_contacts"] <- "Contacts" names(soep.train.c2)[names(soep.train.c2)=="i_age"] <- "Int_age" names(soep.train.c2)[names(soep.train.c2)=="D_resp_m2"] <- "Resp_2012" names(soep.train.c2)[names(soep.train.c2)=="i_lengthint"] <- "Int_lenght" names(soep.train.c2)[names(soep.train.c2)=="D_hincmis"] <- "HH_Inc_missing" ct0 <- ctree(D_response3 ~ SOEP_years + Contacts + G_mode2 + D_partref + D_contact + Resp_2012 + missratio + i_gender + Int_age + i_experience + i_responserate + Int_lenght + SOEP_sample + age + education + D_gender + G_migback + G_empl2 + HH_Inc_missing + hinc_m + D_owner + HH_type + G_famstd + hhgr + health + swb + D_east + D_urban + G_area, data = soep.train.c2, mincriterion = 0.999, minbucket = 20, maxdepth = 4) ct0 pdf("g2_ctree0.pdf", width = 11.25, height = 7.5) plot(ct0, gp = gpar(fontsize = 8.25), ip_args = list(id=F), ep_args = list(justmin=2), tp_args = list(mainlab="")) dev.off() # Bigger CTREE (section 4.2) grid <- expand.grid(mincriterion = c(0.99, 0.95, 0.90, 0.85, 0.75)) set.seed(533) ct4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_m + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib, data = soep.train.c, method = "ctree", trControl = ctrl3, tuneGrid = grid, metric = "ROC") ct4 plot(ct4) confusionMatrix(ct4) ## 06: Random Forest grid <- expand.grid(mtry = 1:12*4) set.seed(533) rf4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_m + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib, data = soep.train.c, method = "rf", trControl = ctrl3, tuneGrid = grid, metric = "ROC") rf4 plot(rf4) ## 07: Gradient Boosting grid <- expand.grid(max_depth = c(7, 8, 9, 10), nrounds = c(1000, 1500, 2000, 2500), eta = c(0.01, 0.05), min_child_weight = 5, subsample = 0.7, gamma = 0, colsample_bytree = 1) set.seed(533) xgb4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_m + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib, data = soep.train.c, method = "xgbTree", trControl = ctrl5, tuneGrid = grid, metric = "ROC") xgb4 plot(xgb4) confusionMatrix(xgb4) ## 08: BART bestmin <- function (x, metric, maximize) # caret mixed up levels { bestIter <- if (maximize) which.min(x[, metric]) else which.max(x[, metric]) bestIter } grid <- expand.grid(num_trees = c(50, 200), k = c(1, 2, 3, 5), alpha = 0.95, beta = 2, nu = 3) set.seed(533) bart4 <- train(D_response3 ~ y_soep + G_contacts + G_mode2 + D_partref + D_contact + D_resp_m2 + missratio + i_gender + i_age + i_experience + i_responserate + i_lengthint + G_psample + age + education + D_gender + G_migback + G_empl2 + D_hincmis + hinc_m + D_owner + G_typ1hh2 + G_famstd + hhgr + health + swb + D_east + D_urban + G_area + pbleib, data = soep.train.c, method = "bartMachine", trControl = ctrl6, tuneGrid = grid, num_burn_in = 500, num_iterations_after_burn_in = 2000, mem_cache_for_speed = FALSE, serialize = TRUE, metric = "ROC") bart4 plot(bart4) confusionMatrix(bart4) plot_convergence_diagnostics(bart4$finalModel) ######################### Evaluate and compare ######################### ## 09: Variable Importances and PDPs logitimp4 <- varImp(logit4) ct4imp4 <- varImp(ct4) rfimp4 <- varImp(rf4) xgbimp4 <- varImp(xgb4) bartimp4 <- varImp(bart4) # g2_varImp1.pdf trellis.device(color = F) logitimp4 plot(logitimp4, top = 20, scales = list(y = list(labels = c("HH type: Single P.","Missing ratio","HH type: Other","Marital: married, sep.","Int.: Age","Empl.: Part-Time","Region: East","Migration: indirect","HH type: ch. < & > 16","Contact: mail & phone", "Migration: direct","Contact: mail | phone","Int.: Response rate","Empl.: Not empl.","Contacts: 4-6","Sample: L","Education","SOEP years","HH-Inc.: Missing","Contacts: 7+"), cex=1.1))) # g2_varImp2.pdf trellis.device(color = F) ct4imp4 plot(ct4imp4, top = 20, scales = list(y = list(labels = c("Int.: Age","Area: Urban","Contact: mail, phone","Home: Owner","Region: East","Health","Inv. Staying Prob.","Empl.","Education","Int.: Response rate","Contacts: 4-6, 7+","2012: Interview, Nonresp.", "Mode","Migration: direct, indirect","HH type","HH size","Int.: Experience","Age","SOEP years","Sample"), cex=1.1))) # g2_varImp3.pdf trellis.device(color = F) rfimp4 plot(rfimp4, top = 20, scales = list(y = list(labels = c("House: 1-2 Fam. House","House: Apt. 3-8 Unit","Area: Urban","Home: Owner","Int.: Female","Gender: Female","Contacts: 4-6","SOEP years","Health","HH size","SWB","Missing ratio","Int.: Experience","Education", "Inv. Staying Prob.","Int.: Age","Int.: Response rate","Age","Int.: Length","HH-Inc."), cex=1.1))) # g2_varImp4.pdf trellis.device(color = F) xgbimp4 plot(xgbimp4, top = 20, scales = list(y = list(labels = c("Sample: L","House: Apt. 3-8 Unit","House: 1-2 Fam. House","Home: Owner","Contacts: 4-6","Int.: Female","Sample: H-K","Health","HH size","SWB","Education","SOEP years","Int.: Experience","Missing ratio", "Inv. Staying Prob.","Int.: Age","Age","Int.: Response rate","Int.: Length","HH-Inc."), cex=1.1))) # g2_varImp5.pdf trellis.device(color = F) bartimp4 plot(bartimp4, top = 20, scales = list(y = list(labels = c("Health","HH type: Other","House: 1-2 Fam. House","Refusal in HH","Empl.: Not empl.","Marital: unmarried","Age","Int.: Age","Sample: L","Region: East", "Migration: direct","Education","Contacts: 7+","Int.: Length","HH-Inc.: Missing","Contacts: 4-6","Missing ratio","Int.: Response rate","Marital: married, sep.","SOEP years"), cex=1.1))) colfunc <- colorRampPalette(c("white", "gray15")) cols <- colfunc(100) # g2_pdp1.pdf trellis.par.set(regions=list(col=cols)) pd1 <- partial(rf4, pred.var = c("age", "i_age"), train = soep.train.c, type = "classification", which.class = 2, prob = T, trim.outliers = T, grid.resolution = 50, progress = "text") plotPartial(pd1, levelplot = F, drape = T, colorkey = F, screen = list(z = 45, x = -60), scales = list(col="black"), par.settings = list(axis.line = list(col = "transparent")), zlab=list(label=expression(hat(y)), cex=1.15), xlab=list(label="Age", cex=1.15), ylab=list(label="Int.: Age", cex=1.15), zoom = 0.93) # g2_pdp2.pdf trellis.par.set(regions=list(col=cols)) pd2 <- partial(rf4, pred.var = c("education", "i_lengthint"), train = soep.train.c, type = "classification", which.class = 2, prob = T, trim.outliers = T, grid.resolution = 50, progress = "text") plotPartial(pd2, levelplot = F, drape = T, colorkey = F, screen = list(z = 45, x = -60), scales = list(col="black"), par.settings = list(axis.line = list(col = "transparent")), zlab=list(label=expression(hat(y)), cex=1.15), xlab=list(label="Education", cex=1.15), ylab=list(label="Int.: Length", cex=1.15), zoom = 0.93) # g2_pdp3.pdf trellis.par.set(regions=list(col=cols)) pd3 <- partial(rf4, pred.var = c("hinc_m", "missratio"), train = soep.train.c, type = "classification", which.class = 2, prob = T, trim.outliers = T, grid.resolution = 50, progress = "text") plotPartial(pd3, levelplot = F, drape = T, colorkey = F, screen = list(z = 45, x = -60), scales = list(col="black"), par.settings = list(axis.line = list(col = "transparent")), zlab=list(label=expression(hat(y)), cex=1.15), xlab=list(label="HH-Inc.", cex=1.15), ylab=list(label="Missing ratio", cex=1.15), zoom = 0.93) # g2_pdp4.pdf trellis.par.set(regions=list(col=cols)) pd4 <- partial(rf4, pred.var = c("y_soep", "i_responserate"), train = soep.train.c, type = "classification", which.class = 2, prob = T, trim.outliers = T, grid.resolution = 50, progress = "text") plotPartial(pd4, levelplot = F, drape = T, colorkey = F, screen = list(z = -45, x = -60), scales = list(col="black"), par.settings = list(axis.line = list(col = "transparent")), zlab=list(label=expression(hat(y)), cex=1.15), xlab=list(label="SOEP years", cex=1.15), ylab=list(label="Int.: Response rate", cex=1.15), zoom = 0.93) ## 10: Prediction in test set # Predict classes and probabilities p.logit4 <- predict(logit4, newdata = soep.test.c) pr.logit4 <- predict(logit4, newdata = soep.test.c, type = "prob") pr.glmt4 <- predict(glmtree4, newdata = soep.test.c, type = "response") p.glmt4 <- ifelse(pr.glmt4 > 0.5, "refusal", "interview") p.ct4 <- predict(ct4, newdata = soep.test.c) pr.ct4 <- predict(ct4, newdata = soep.test.c, type = "prob") p.rf4 <- predict(rf4, newdata = soep.test.c) pr.rf4 <- predict(rf4, newdata = soep.test.c, type = "prob") p.xgb4 <- predict(xgb4, newdata = soep.test.c) pr.xgb4 <- predict(xgb4, newdata = soep.test.c, type = "prob") p.bart4 <- predict(bart4, newdata = soep.test.c) pr.bart4 <- predict(bart4, newdata = soep.test.c, type = "prob") # ROC and PR curves fg1 <- pr.logit4$refusal[soep.test.c$D_response3 == "refusal"] bg1 <- pr.logit4$refusal[soep.test.c$D_response3 == "interview"] fg3 <- pr.ct4$refusal[soep.test.c$D_response3 == "refusal"] bg3 <- pr.ct4$refusal[soep.test.c$D_response3 == "interview"] fg4 <- pr.rf4$refusal[soep.test.c$D_response3 == "refusal"] bg4 <- pr.rf4$refusal[soep.test.c$D_response3 == "interview"] fg5 <- pr.xgb4$refusal[soep.test.c$D_response3 == "refusal"] bg5 <- pr.xgb4$refusal[soep.test.c$D_response3 == "interview"] fg6 <- pr.glmt4[soep.test.c$D_response3 == "refusal"] bg6 <- pr.glmt4[soep.test.c$D_response3 == "interview"] fg7 <- pr.bart4$interview[soep.test.c$D_response3 == "refusal"] # caret mixed up levels bg7 <- pr.bart4$interview[soep.test.c$D_response3 == "interview"] # caret mixed up levels logit4.roc <- roc.curve(scores.class0 = fg1, scores.class1 = bg1, curve = T) ct4.roc <- roc.curve(scores.class0 = fg3, scores.class1 = bg3, curve = T) rf4.roc <- roc.curve(scores.class0 = fg4, scores.class1 = bg4, curve = T) xgb4.roc <- roc.curve(scores.class0 = fg5, scores.class1 = bg5, curve = T) glmt4.roc <- roc.curve(scores.class0 = fg6, scores.class1 = bg6, curve = T) bart4.roc <- roc.curve(scores.class0 = fg7, scores.class1 = bg7, curve = T) p1 <- xyplot(logit4.roc$curve[,2] ~ logit4.roc$curve[,1], type = "l", lwd = 2, col = "black", xlab = list(label = "1 - Specificity", cex = 1.25), ylab = list(label = "Sensitivity", cex = 1.25), key = list(columns = 2, text = list(lab = c("Logit","MOB","CTREE","RF","XGBoost","BART")), cex = 1.25, lines = list(type = "l", lwd = 2, col = c("black", "gray", "#1B9E77", "#7570B3", "#E6AB02", "#D95F02"))), scales = list(cex = 1.25), panel = function(x, ...){ panel.xyplot(x, ...) panel.abline(0,1)}) p2 <- xyplot(glmt4.roc$curve[,2] ~ glmt4.roc$curve[,1], type = "l", lwd = 2, col = "gray") p3 <- xyplot(ct4.roc$curve[,2] ~ ct4.roc$curve[,1], type = "l", lwd = 2, col = "#1B9E77") p4 <- xyplot(rf4.roc$curve[,2] ~ rf4.roc$curve[,1], type = "l", lwd = 2, col = "#7570B3") p5 <- xyplot(xgb4.roc$curve[,2] ~ xgb4.roc$curve[,1], type = "l", lwd = 2, col = "#E6AB02") p6 <- xyplot(bart4.roc$curve[,2] ~ bart4.roc$curve[,1], type = "l", lwd = 2, col = "#D95F02") # ROC plot x11(width=7, height=7) p1 + as.layer(p2) + as.layer(p3) + as.layer(p4) + as.layer(p5) + as.layer(p6) logit4.pr <- pr.curve(scores.class0 = fg1, scores.class1 = bg1, curve = T) ct4.pr <- pr.curve(scores.class0 = fg3, scores.class1 = bg3, curve = T) rf4.pr <- pr.curve(scores.class0 = fg4, scores.class1 = bg4, curve = T) xgb4.pr <- pr.curve(scores.class0 = fg5, scores.class1 = bg5, curve = T) glmt4.pr <- pr.curve(scores.class0 = fg6, scores.class1 = bg6, curve = T) bart4.pr <- pr.curve(scores.class0 = fg7, scores.class1 = bg7, curve = T) p1 <- xyplot(logit4.pr$curve[,2] ~ logit4.pr$curve[,1], type = "l", lwd = 2, col = "black", xlab = list(label = "Recall", cex = 1.25), ylab = list(label = "Precision", cex = 1.25), ylim = c(-0.07,1.07), scales = list(cex = 1.25), key = list(columns = 2, text = list(lab = c("Logit","MOB","CTREE","RF","XGBoost","BART")), cex = 1.25, lines = list(type = "l", lwd = 2, col = c("black", "gray", "#1B9E77", "#7570B3", "#E6AB02", "#D95F02")))) p2 <- xyplot(glmt4.pr$curve[,2] ~ glmt4.pr$curve[,1], type = "l", lwd = 2, col = "gray") p3 <- xyplot(ct4.pr$curve[,2] ~ ct4.pr$curve[,1], type = "l", lwd = 2, col = "#1B9E77") p4 <- xyplot(rf4.pr$curve[,2] ~ rf4.pr$curve[,1], type = "l", lwd = 2, col = "#7570B3") p5 <- xyplot(xgb4.pr$curve[,2] ~ xgb4.pr$curve[,1], type = "l", lwd = 2, col = "#E6AB02") p6 <- xyplot(bart4.pr$curve[,2] ~ bart4.pr$curve[,1], type = "l", lwd = 2, col = "#D95F02") # PR plot x11(width=7, height=7) p1 + as.layer(p2) + as.layer(p3) + as.layer(p4) + as.layer(p5) + as.layer(p6) # Threshold optimization logit4.roc <- roc(soep.test.c$D_response3, pr.logit4$refusal) logit4.t0 <- coords(logit4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1)) logit4.t1 <- coords(logit4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1)) pt0.logit4 <- as.factor(ifelse(pr.logit4$refusal > logit4.t0[1], "refusal", "interview")) pt1.logit4 <- as.factor(ifelse(pr.logit4$refusal > logit4.t1[1], "refusal", "interview")) ct4.roc <- roc(soep.test.c$D_response3, pr.ct4$refusal) ct4.t0 <- coords(ct4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1)) ct4.t1 <- coords(ct4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1)) pt0.ct4 <- as.factor(ifelse(pr.ct4$refusal > ct4.t0[1], "refusal", "interview")) pt1.ct4 <- as.factor(ifelse(pr.ct4$refusal > ct4.t1[1], "refusal", "interview")) rf4.roc <- roc(soep.test.c$D_response3, pr.rf4$refusal) rf4.t0 <- coords(rf4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1)) rf4.t1 <- coords(rf4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1)) pt0.rf4 <- as.factor(ifelse(pr.rf4$refusal > rf4.t0[1], "refusal", "interview")) pt1.rf4 <- as.factor(ifelse(pr.rf4$refusal > rf4.t1[1], "refusal", "interview")) xgb4.roc <- roc(soep.test.c$D_response3, pr.xgb4$refusal) xgb4.t0 <- coords(xgb4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1)) xgb4.t1 <- coords(xgb4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1)) pt0.xgb4 <- as.factor(ifelse(pr.xgb4$refusal > xgb4.t0[1], "refusal", "interview")) pt1.xgb4 <- as.factor(ifelse(pr.xgb4$refusal > xgb4.t1[1], "refusal", "interview")) glmt4.roc <- roc(soep.test.c$D_response3, pr.glmt4) glmt4.t0 <- coords(glmt4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1)) glmt4.t1 <- coords(glmt4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1)) pt0.glmt4 <- as.factor(ifelse(pr.glmt4 > glmt4.t0[1], "refusal", "interview")) pt1.glmt4 <- as.factor(ifelse(pr.glmt4 > glmt4.t1[1], "refusal", "interview")) bart4.roc <- roc(soep.test.c$D_response3, pr.bart4$interview) bart4.t0 <- coords(bart4.roc, x = "best", best.method = "closest.topleft", best.weights = c(1, 0.1)) bart4.t1 <- coords(bart4.roc, x = "best", best.method = "closest.topleft", best.weights = c(0.5, 0.1)) pt0.bart4 <- as.factor(ifelse(pr.bart4$interview > bart4.t0[1], "refusal", "interview")) pt1.bart4 <- as.factor(ifelse(pr.bart4$interview > bart4.t1[1], "refusal", "interview")) cm10 <- confusionMatrix(pt0.logit4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm30 <- confusionMatrix(pt0.ct4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm40 <- confusionMatrix(pt0.rf4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm50 <- confusionMatrix(pt0.xgb4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm60 <- confusionMatrix(pt0.glmt4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm70 <- confusionMatrix(pt0.bart4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm11 <- confusionMatrix(pt1.logit4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm31 <- confusionMatrix(pt1.ct4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm41 <- confusionMatrix(pt1.rf4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm51 <- confusionMatrix(pt1.xgb4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm61 <- confusionMatrix(pt1.glmt4, soep.test.c$D_response3, positive = "refusal", mode = "everything") cm71 <- confusionMatrix(pt1.bart4, soep.test.c$D_response3, positive = "refusal", mode = "everything") Logit1 <- c(logit4.roc$auc, logit4.pr$auc.integral) ctree <- c(ct4.roc$auc, ct4.pr$auc.integral) RF <- c(rf4.roc$auc, rf4.pr$auc.integral) GBM <- c(xgb4.roc$auc, xgb4.pr$auc.integral) glmtree <- c(glmt4.roc$auc, glmt4.pr$auc.integral) bart <- c(bart4.roc$auc, bart4.pr$auc.integral) tab <- xtable(t(rbind(Logit1, glmtree, ctree, RF, GBM, bart)), digits = 3) print(tab, type="latex", file="2_models4_test2.tex") Logit1 <- c(cm10$overall[1], cm10$byClass[c(1:2,5,7)], cm10$overall[2]) ctree <- c(cm30$overall[1], cm30$byClass[c(1:2,5,7)], cm30$overall[2]) RF <- c(cm40$overall[1], cm40$byClass[c(1:2,5,7)], cm40$overall[2]) GBM <- c(cm50$overall[1], cm50$byClass[c(1:2,5,7)], cm50$overall[2]) glmtree <- c(cm60$overall[1], cm60$byClass[c(1:2,5,7)], cm60$overall[2]) bart <- c(cm70$overall[1], cm70$byClass[c(1:2,5,7)], cm70$overall[2]) tab <- xtable(rbind(Logit1, glmtree, ctree, RF, GBM, bart), digits = 3) print(tab, type="latex", file="2_models4_test20.tex") Logit1 <- c(cm11$overall[1], cm11$byClass[c(1:2,5,7)], cm11$overall[2]) ctree <- c(cm31$overall[1], cm31$byClass[c(1:2,5,7)], cm31$overall[2]) RF <- c(cm41$overall[1], cm41$byClass[c(1:2,5,7)], cm41$overall[2]) GBM <- c(cm51$overall[1], cm51$byClass[c(1:2,5,7)], cm51$overall[2]) glmtree <- c(cm61$overall[1], cm61$byClass[c(1:2,5,7)], cm61$overall[2]) bart <- c(cm71$overall[1], cm71$byClass[c(1:2,5,7)], cm71$overall[2]) tab <- xtable(rbind(Logit1, glmtree, ctree, RF, GBM, bart), digits = 3) print(tab, type="latex", file="2_models4_test21.tex")