####################################################################################################
# Prediction of wave 14 participation probabilities based on wave 13
####################################################################################################

# Load data

####### Apply ml model on wave 13 data
load("data_w13.rdata")

# mice imputation
cols <- c("zpsex", "PB0100", "PD0200", "PD0500", "PET0510", "PEK1400", "PEK1450", "PSK0100", "PSK0400a", "PSK0400b", "PSK0400c", "PSK0400d", "PSK0400e", "PG0500", "PG0800", "PP0110", "PMI0100", "ekinu18", "etakt", "alakt", "partner", "zuspiel", "schul2", "beruf2", "hintmod", "hsprache", "HW0300", "HEK0100", "HEK0120", "HEK0200", "HEK0400", "HEK2000", "HEK2200", "kindu4", "kindu15", "alg2abez", "umzug", "bundesld", "sample_kat", "interesse", "verstaendnis", "zuverlaessig", "schwierigkeit", "HLS2300a_mis", "HLS2400a_mis", "HEK0600_mis", "HEK1200_mis", "HEK1420_mis", "PA0900_mis", "int_sex", "int_schul")
data_w13[cols] <- lapply(data_w13[cols], factor)
sapply(data_w13, class)

drops <- c("hnr", "mis_pnr", "HLS2300a_mis", "HLS2400a_mis", "HEK0600_mis", "HEK1200_mis", "HEK1420_mis", "PA0900_mis")
testData1 <- data_w13[ , !(names(data_w13) %in% drops)]
testData <- mice(testData1, m=1, maxit=5, seed=1234, nnet.MaxNWts = 2500) # maxit = 5 

dataImpTest1 <- mice::complete(testData, 1)
dataImpTest <- cbind(dataImpTest1, data_w13[ , (names(data_w13) %in% drops)])

# identify numerical variables
numeric_vars <- vapply(dataImpTest, is.numeric, logical(1))

# identify factor variables
factor_vars <- vapply(dataImpTest, is.factor, logical(1))

# one-hot-encode factors
factors_as_dummies <- model.matrix( ~ . -1, dataImpTest[, factor_vars])

# combine non-factor variables with dummy-coded factors
dataTest_ml <- cbind(dataImpTest[, !factor_vars], factors_as_dummies[,2:ncol(factors_as_dummies)])
dataTest_ml$hsprache2 <- 0

# Load model
load("DATA/AllModelsPaper.RData")

# Select Model
model1 <- models$rf

# Preparation
data_ml$welle <- NULL
levels(data_ml$ausfall) <- make.names(levels(factor(data_ml$ausfall)))
ctrl_fix <- trainControl(method = "none", classProbs=TRUE, sampling = "down") #, sampling = "down")
rfGrid <- model1$bestTune

set.seed(1234)
system.time(models$rf_fix <- train(ausfall~., 
                                             data = data_ml,
                                             method = "ranger",
                                             metric = performance_metric,
                                             preProcess = c("center", "scale"),
                                             trControl = ctrl_fix,
                                             tuneGrid = rfGrid))

# Validation
valid_pred <- predict(models$rf_fix, dataTest_ml, type = "prob")
X0median <- median(valid_pred$X0)
valid_pred$Ausfall <- ifelse(valid_pred$X0 <= X0median, 1, 0)
valid_pred$hnr <- dataTest_ml$hnr
valid_pred$hintmod <- dataTest_ml$hintmod

# Save data as .dta for stata
write.dta(valid_pred, "DATA/predictions_w13.dta")
