# Note: file locations and pathways may be subject to change. 

#### Libraries #### 
# Loading data: 
library(haven) # Load SPSS files 

# Data manipulation: 
library(dplyr) # Easy data manipulation 

# Analyses 
library(margins) # Average marginal effects 
library(survival) # Survival analysis
library(Hmisc) # Weighted means, quartiles, and standard deviations
source("RISQ_R-indicators_v2.1.r", local = TRUE) # R indicators 

# Plots: 
library(ggplot2) # Plots 
library(ggpubr) # Customizing plots created with ggplot2
library(ggcorrplot) # Correlation plot using ggplot2
library(survminer) # survival plot using ggplot2

#### Open workspace ####
load("DataPrep.RData") # Load workspace created and saved in DataPrep.R
# OR: 
#load("Analyses.RData") # Load workspace saved from this file 

#### Load additional data ####
## 2020 web-based sample
# Information on response: 
BO_2020_respons <- read_sav("BO 2020 respons.sav") # Load response outcomes for 2020 HBS

#### Prep ####
R <- 10000 # Set number of Bootstrap samples to 10000
combined_resp_2021$age <- droplevels(combined_resp_2021$age) # Drop unused age levels (unknown) from combined sample 
lu_resp_2021$age <- droplevels(lu_resp_2021$age) # Drop unused age levels (unknown) from Luxembourg sample 

combined_resp_2021 <- as.data.frame(combined_resp_2021) # Make sure dataset is stored as dataframe

save.image("Analyses.RData") # Save workspace 

#### Calculate sample sizes #### 
nrow(nl_resp_2021) # NL 2021

#### Registration and Activity in the App-Based HBS Compared to the Web-Based HBS (RQ 1) ####
##### Web-based sample #####
###### Registration rate: ######
sum(BO_2020_respons$AGcompl) # Number of registered households
mean(BO_2020_respons$AGcompl)*100 #  Registration rate 

nl2020_registered <- NULL # Storage for Bootstrap, 2020 registration rate 
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the 2020 data:  
  sample_d = BO_2020_respons[sample(1:nrow(BO_2020_respons), nrow(BO_2020_respons), replace = TRUE), ]
  
  # Calculate the registration rate in the 2020 sample: 
  model_bootstrap <- mean(sample_d$AGcompl)
  
  # Save the results:  
  nl2020_registered <- c(nl2020_registered, model_bootstrap)
}

mean(nl2020_registered)*100 # Bootstrapped mean 
quantile(nl2020_registered, probs = c(0.025, 0.975))*100 # Bootstrapped CI 
hist(nl2020_registered) # Distribution of Bootstrapped statistic 
save.image("Analyses.RData") # Save workspace

###### Activity rate: ######
BO_2020_respons$active <- NA # Create active variable 
BO_2020_respons$active[BO_2020_respons$ActiveW1 > 0 | 
                         BO_2020_respons$ActiveW2 > 0 |
                         BO_2020_respons$ActiveW3 > 0 |
                         BO_2020_respons$ActiveW4 > 0] <- 1 # Household is active if they reported at least one purchase in one of the weeks 
BO_2020_respons$active[is.na(BO_2020_respons$active)] <- 0 # Household is not active if they never reported a purchase 

sum(BO_2020_respons$active) # Number of active households 
mean(BO_2020_respons$active)*100 # Activity rate 

nl2020_active <- NULL # Storage for Bootstrap, 2020 activity rate 
set.seed(2022) # Set seed
for(r in 1:R){ # Bootstrap
  
  # Create resampled dataset from the 2020 dataset: 
  sample_d = BO_2020_respons[sample(1:nrow(BO_2020_respons), nrow(BO_2020_respons), replace = TRUE), ]
  
  # Calculate activity rate in Bootstrapped samples:  
  model_bootstrap <- mean(sample_d$active)
  
  # Save the results: 
  nl2020_active <- c(nl2020_active, model_bootstrap)
}

mean(nl2020_active)*100 # Bootstrapped mean 
quantile(nl2020_active, probs = c(0.025, 0.975))*100 # Bootstrapped CI 
hist(nl2020_active) # Distribution of Bootstrapped statistic 

save.image("Analyses.RData") # Save workspace

###### Completion rate: ######
correction_2020$complete <- 1 # Add completion variable in file containing all households that completed the 2020 HBS
BO_2020_respons <- merge(BO_2020_respons, correction_2020[,c("UserID", "complete")], by = "UserID", all.x = TRUE) # Add completion variable to response file
BO_2020_respons$complete[is.na(BO_2020_respons$complete)] <- 0 # Set NA to 0 for households that did not complete the survey

sum(BO_2020_respons$complete) # Number of households that completed the survey 
mean(BO_2020_respons$complete)*100 # Completion rate 

nl2020_complete <- NULL # Storage for Bootstrap, 2020 completion rate 
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap
  
  # Create resampled dataset from the 2020 dataset:
  sample_d = BO_2020_respons[sample(1:nrow(BO_2020_respons), nrow(BO_2020_respons), replace = TRUE), ]
  
  # Calculate completion rate in Bootstrapped samples: 
  model_bootstrap <- mean(sample_d$complete) 
  
  # Save the results:  
  nl2020_complete <- c(nl2020_complete, model_bootstrap)
}

mean(nl2020_complete)*100 # Bootstrapped mean 
quantile(nl2020_complete, probs = c(0.025, 0.975))*100 # Bootstrapped CI
hist(nl2020_complete) # Distribution of Bootstrapped statistic

save.image("Analyses.RData") # Save workspace

###### Conditional activity rate: ######
(sum(BO_2020_respons$active)/sum(BO_2020_respons$AGcompl))*100 # Conditional activity rate

nl2020_ar <- NULL # Storage for Bootstrap, 2020 conditional activity rate
set.seed(2022) # Set seed
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the 2020 dataset
  sample_d = BO_2020_respons[sample(1:nrow(BO_2020_respons), nrow(BO_2020_respons), replace = TRUE), ]
  
  # Calculate conditional activity rate in Bootstrapped samples:  
  model_bootstrap <- (sum(sample_d$active)/sum(sample_d$AGcompl))
  
  # Save the results: 
  nl2020_ar <- c(nl2020_ar, model_bootstrap)
}

mean(nl2020_ar)*100 # Bootstrapped mean
quantile(nl2020_ar, probs = c(0.025, 0.975))*100 # Bootstrapped CI 
hist(nl2020_ar) # Distribution of Bootstrapped statistics 

save.image("Analyses.RData") # Save workspace

###### Conditional completion rate: ######
(sum(BO_2020_respons$complete)/sum(BO_2020_respons$active))*100 # Conditional completion rate 

nl2020_ca <- NULL # Storage for Bootstrap, 2020 conditional completion rate 
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the 2020 dataset: 
  sample_d = BO_2020_respons[sample(1:nrow(BO_2020_respons), nrow(BO_2020_respons), replace = TRUE), ]
  
  # Calculate conditional completion rate in Bootstrapped samples:  
  model_bootstrap <- (sum(sample_d$complete)/sum(sample_d$active))
  
  # Save the results:  
  nl2020_ca <- c(nl2020_ca, model_bootstrap)
}

mean(nl2020_ca)*100 # Bootstrapped mean 
quantile(nl2020_ca, probs = c(0.025, 0.975))*100 # Bootstrapped CI 
hist(nl2020_ca) # Distribution of Bootstrapped statistics 

save.image("Analyses.RData") # Save workspace

##### 2021 NL #####
###### Registration rate: ######
sum(nl_resp_2021$registered) # Number of registered households
mean(nl_resp_2021$registered)*100 # Registration rate 

nl_registered <- NULL # Storage for Bootstrap, NL 2021 registration rate 
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap
  
  # Create resampled dataset from the 2021 NL dataset: 
  sample_d = nl_resp_2021[sample(1:nrow(nl_resp_2021), nrow(nl_resp_2021), replace = TRUE), ]
  
  # Calculate registration rate in Bootstrapped samples:  
  model_bootstrap <- (mean(sample_d$registered))
  
  # Save the results: 
  nl_registered <- c(nl_registered, model_bootstrap)
}

mean(nl_registered)*100 # Bootstrapped mean 
quantile(nl_registered, probs = c(0.025, 0.975))*100 # Bootstrapped CI 
hist(nl_registered) # Distribution of Bootstrapped statistic 

save.image("Analyses.RData") # Save workspace

###### Activity rate: ######
sum(nl_resp_2021$active) # Number of active households 
mean(nl_resp_2021$active)*100 # Activity rate 

nl_active <- NULL # Storage for Bootstrap, NL 2021 activity rate 
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap
  
  # Create resampled dataset from the 2021 NL dataset: 
  sample_d = nl_resp_2021[sample(1:nrow(nl_resp_2021), nrow(nl_resp_2021), replace = TRUE), ]
  
  # Calculate activity rate in Bootstrapped samples: 
  model_bootstrap <- (mean(sample_d$active))
  
  # Save the results: 
  nl_active <- c(nl_active, model_bootstrap)
}

mean(nl_active)*100 # Bootstrapped mean 
quantile(nl_active, probs = c(0.025, 0.975))*100 # Bootstrapped CI 
hist(nl_active) # Distribution of Bootstrapped statistic

save.image("Analyses.RData") # Save workspace

###### Completion rate: ######
sum(nl_resp_2021$complete) # Number of households that completed the survey 
mean(nl_resp_2021$complete)*100 # Completion rate 

nl_complete <- NULL # Storage for Bootstrap, NL 2021 completion rate
set.seed(2022) # Set seed
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the 2021 NL dataset:  
  sample_d = nl_resp_2021[sample(1:nrow(nl_resp_2021), nrow(nl_resp_2021), replace = TRUE), ]
  
  # Calculate completion rate in Bootstrapped samples: 
  model_bootstrap <- (mean(sample_d$complete))
  
  # Save the results: 
  nl_complete <- c(nl_complete, model_bootstrap)
}

mean(nl_complete)*100 # Bootstrapped mean 
quantile(nl_complete, probs = c(0.025, 0.975))*100 # Bootstrapped CI
hist(nl_complete)

save.image("Analyses.RData") # Save workspace

###### Conditional activity rate: ######
(sum(nl_resp_2021$active)/sum(nl_resp_2021$registered))*100 # Conditional activity rate 

nl_ar <- NULL # Storage for Bootstrap, NL 2021 conditional activity rate  
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap
  
  # Create resampled dataset from the 2021 NL dataset: 
  sample_d = nl_resp_2021[sample(1:nrow(nl_resp_2021), nrow(nl_resp_2021), replace = TRUE), ]
  
  # Calculate conditional activity rate in Bootstrapped samples: 
  model_bootstrap <- (sum(sample_d$active)/sum(sample_d$registered))
  
  # Save the results:  
  nl_ar <- c(nl_ar, model_bootstrap)
}

mean(nl_ar)*100 # Bootstrapped mean
quantile(nl_ar, probs = c(0.025, 0.975))*100 # Bootstrapped CI 
hist(nl_ar)

save.image("Analyses.RData") # Save workspace

###### Conditional completion rate: ######
(sum(nl_resp_2021$complete)/sum(nl_resp_2021$active))*100 # Conditional completion rate 

nl_ca <- NULL # Storage for Bootstrap, NL 2021 conditional completion rate 
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the 2021 NL dataset: 
  sample_d = nl_resp_2021[sample(1:nrow(nl_resp_2021), nrow(nl_resp_2021), replace = TRUE), ]
  
  # Calculate conditional completion rate in Bootstrapped samples:  
  model_bootstrap <- (sum(sample_d$complete)/sum(sample_d$active))
  
  # Save the results:  
  nl_ca <- c(nl_ca, model_bootstrap)
}

mean(nl_ca)*100 # Bootstrapped mean 
quantile(nl_ca, probs = c(0.025, 0.975))*100 # Bootstrapped CI 
hist(nl_ca)

save.image("Analyses.RData") # Save workspace

#### Representativeness of App-Based and Web-Based HBS (RQ 1) ####
##### Web-based sample #####
nl_pop_2020_w$crossed <- NA # Create empty variable 
nl_pop_2020_w$crossed <- interaction(nl_pop_2020_w$age, nl_pop_2020_w$hhsize) # Create variable that crosses all levels of age with all levels of household size 
table(nl_pop_2020_w$crossed) # Check new variable 

nl_pop_2020_w <- nl_pop_2020_w %>% 
  group_by(crossed) %>%
  mutate(avg_resp = mean(resp_prob)) # Calculate average response probability per level of the new crossed variable 

rho_bar <- (1/sum(nl_pop_2020_w$eindw))*(sum(nl_pop_2020_w$eindw * nl_pop_2020_w$avg_resp)) # Calculate weighted sample mean of the estimated response probabilities
S_p <- sqrt(1/(sum(nl_pop_2020_w$eindw)-1)*sum(nl_pop_2020_w$eindw*((nl_pop_2020_w$avg_resp-rho_bar)^2))) # Calculate weighted sample standard deviation of the estimated response probabilities
R_ind <- 1-(2*S_p) # Calculate R indicator
R_ind

CV <- S_p/rho_bar # Calculate coefficient of variation.
CV  

R_2020 <- NULL # Storage for Bootstrap, R indicator complete web-based sample 
set.seed(2022) # Set seed
for(r in 1:R){ #Bootstrap 
  
  # Create resampled dataset from the 2020 dataset: 
  sample_d = nl_pop_2020_w[sample(1:nrow(nl_pop_2020_w), nrow(nl_pop_2020_w), replace = TRUE), ]
  
  # Calculate R indicator in Bootstrapped samples: 
  rho_bar <- (1/sum(sample_d$eindw))*(sum(sample_d$eindw * sample_d$avg_resp)) # Calculate weighted sample mean of the estimated response probabilities
  S_p <- sqrt(1/(sum(sample_d$eindw)-1)*sum(sample_d$eindw*((sample_d$avg_resp-rho_bar)^2))) # Calculate weighted sample standard deviation of the estimated response probabilities
  R_Bootstrap <- 1-(2*S_p) # Calculate R indicator
  
  # Save the results: 
  R_2020 <- c(R_2020, R_Bootstrap)
}

mean(R_2020) # Bootstrapped mean 
quantile(R_2020, probs = c(0.025, 0.975)) # Bootstrapped CI 
hist(R_2020) # Distribution of Bootstrapped statistic

save.image("Analyses.RData") # Save workspace

##### 2021 NL #####
###### Logistic regression models: ######
# Registration: 
summary(glm(registered ~ age + hhsize + origin + edulev + urban + homeowner + income, data = nl_resp_2021, family = "binomial")) # Logistic regression model
summary(margins(glm(registered ~ age + hhsize + origin + edulev + urban + homeowner + income, data = nl_resp_2021, family = "binomial"))) # Average marginal effects

# Activity: 
summary(glm(active ~ age + hhsize + origin + edulev + urban + homeowner + income, data = nl_resp_2021, family = "binomial")) # Logistic regression model
summary(margins(glm(active ~ age + hhsize + origin + edulev + urban + homeowner + income, data = nl_resp_2021, family = "binomial"))) # Average marginal effects

###### R indicators: ######
# Registration: 
ir_nl1 <- getRIndicator(formula = registered ~ age + hhsize + origin + edulev + urban + homeowner + income,
                        sampleData = nl_resp_2021, 
                        sampleWeights = rep(3000, nrow(nl_resp_2021)),
                        withPartials = TRUE)

ir_nl1$RUnadj # R indicator
ir_nl1$CVUnadj # Coefficient of variation
ir_nl1$R # R indicator
ir_nl1$CV # Coefficient of variation


ir_nl1$partialR$byVariables # Unconditional partial indicators (PuUnadj) and conditional partial indicators (PcUnadj)

R_nl2021_r <- NULL # Storage for Bootstrap, R indicator for registered 2021 NL sample 
set.seed(2022) # Set seed
for(r in 1:R){ #Bootstrap 
  
  # Create resampled dataset from the 2021 NL dataset: 
  sample_d = nl_resp_2021[sample(1:nrow(nl_resp_2021), nrow(nl_resp_2021), replace = TRUE), ]
  
  # Calculate R indicator in Bootstrapped samples: 
  Model_bootstrap <- getRIndicator(formula = registered ~ age + hhsize + origin + edulev + urban + homeowner + income,
                                   sampleData = sample_d, 
                                   sampleWeights = rep(3000, nrow(sample_d)))
  
  R_Bootstrap <- Model_bootstrap$RUnadj
  
  # Save the results: 
  R_nl2021_r <- c(R_nl2021_r, R_Bootstrap)
}

mean(R_nl2021_r) # Bootstrapped mean
quantile(R_nl2021_r, probs = c(0.025, 0.975)) # Bootstrapped CI 
hist(R_nl2021_r) # Distribution of Bootstrapped statistic 

save.image("Analyses.RData") # Save workspace

# Activity: 
ia_nl1 <- getRIndicator(formula = active ~ age + hhsize + origin + edulev + urban + homeowner + income,
                        sampleData = nl_resp_2021, 
                        sampleWeights = rep(3000, nrow(nl_resp_2021)),
                        withPartials = TRUE)

ia_nl1$RUnadj # R indicator 
ia_nl1$CVUnadj # Coefficient of variation
ia_nl1$R # R indicator 
ia_nl1$CV # Coefficient of variation
ia_nl1$partialR$byVariables # Unconditional partial indicators (PuUnadj) and conditional partial indicators (PcUnadj)

R_nl2021_a <- NULL # Storage for Bootstrap, R indicator for active 2021 NL sample 
set.seed(2022) # Set seed
for(r in 1:R){ #Bootstrap 
  
  # Create resampled dataset from the 2021 NL dataset: 
  sample_d = nl_resp_2021[sample(1:nrow(nl_resp_2021), nrow(nl_resp_2021), replace = TRUE), ]
  
  # Calculate R indicator in Bootstrapped samples: 
  Model_bootstrap <- getRIndicator(formula = active ~ age + hhsize + origin + edulev + urban + homeowner + income,
                                   sampleData = sample_d, 
                                   sampleWeights = rep(3000, nrow(sample_d)))
  
  R_Bootstrap <- Model_bootstrap$RUnadj
  
  # Save the results: 
  R_nl2021_a <- c(R_nl2021_a, R_Bootstrap)
}

mean(R_nl2021_a) # Bootstrapped mean 
quantile(R_nl2021_a, probs = c(0.025, 0.975)) # Bootstrapped CI
hist(R_nl2021_a) # Distribution of Bootstrapped statistic 

save.image("Analyses.RData") # Save workspace

# Completion: 
ic_nl2 <- getRIndicator(formula = complete ~ age + hhsize,
                        sampleData = nl_resp_2021, 
                        sampleWeights = rep(3000, nrow(nl_resp_2021)),
                        withPartials = TRUE)

ic_nl2$RUnadj # R indicator 
ic_nl2$CVUnadj # Coefficient of variation
ic_nl2$R # R indicator 
ic_nl2$CV # Coefficient of variation


R_nl2021_c <- NULL # Storage for Bootstrap, R indicator for complete 2021 NL sample 
set.seed(2022) # Set seed
for(r in 1:R){ #Bootstrap 
  
  # Create resampled dataset from the 2021 NL dataset: 
  sample_d = nl_resp_2021[sample(1:nrow(nl_resp_2021), nrow(nl_resp_2021), replace = TRUE), ]
  
  # Calculate R indicator in Bootstrapped samples: 
  Model_bootstrap <- getRIndicator(formula = complete ~ age + hhsize,
                                   sampleData = sample_d, 
                                   sampleWeights = rep(3000, nrow(sample_d)))
  
  R_Bootstrap <- Model_bootstrap$RUnadj
  
  # Save the results: 
  R_nl2021_c <- c(R_nl2021_c, R_Bootstrap)
}

mean(R_nl2021_c) # Bootstrapped mean 
quantile(R_nl2021_c, probs = c(0.025, 0.975)) # Bootstrapped CI 
hist(R_nl2021_c) # Distribution of Bootstrapped statistic 

save.image("Analyses.RData") # Save workspace


#### Effects of Personalized Insights on Registration, Activity, and Dropout (RQ 2) ####
# Subset immediate and delayed insights into separate datasets: 
immediate <- subset(combined_resp_2021, insights == 1) # Subset households with immediate insights
immediate <- as.data.frame(immediate) # Make sure it is a dataframe
delayed <- subset(combined_resp_2021, insights == 0) # Subset households with delayed insights
delayed <- as.data.frame(delayed) # Make sure it is a dataframe

###### Response rates per condition: ######
# Registration: 
(mean(immediate$registered))*100 # Registration rate in the immediate insights group
(mean(delayed$registered))*100 # Registration rate in the delayed insights group 

# Activity: 
(mean(immediate$active))*100 # Activity rate in the immediate insights group 
(mean(delayed$active))*100 # Activity rate in the delayed insights group  

# Bootstrap: 
imm_r <- NULL # Storage (immediate registered)
imm_a <- NULL # Storage (immediate active)
del_r <- NULL # Storage (delayed registered)
del_a <- NULL # Storage (delayed active)
set.seed(2022) # Set seed
for(r in 1:R){ # Bootstrap:  
  
  # Create resampled dataset from the 2021 combined dataset: 
  sample_d = combined_resp_2021[sample(1:nrow(combined_resp_2021), nrow(combined_resp_2021), replace = TRUE), ]
  sample_d1 <- subset(sample_d, insights == 1) # Immediate insights group 
  sample_d2 <- subset(sample_d, insights == 0) # Delayed insights group 
  
  # Calculate statistics in Bootstrapped samples:  
  model_bootstrap1 <- (mean(sample_d1$registered)) # Registration rate immediate group
  model_bootstrap2 <- (mean(sample_d1$active)) # Activity rate immediate group
  model_bootstrap3 <- (mean(sample_d2$registered)) # Registration rate delayed group
  model_bootstrap4 <- (mean(sample_d2$active)) # Activity rate delayed group 
  
  # Save the results: 
  imm_r <- c(imm_r, model_bootstrap1) # immediate registered
  imm_a <- c(imm_a, model_bootstrap2) # immediate active
  del_r <- c(del_r, model_bootstrap3) # delayed registered
  del_a <- c(del_a, model_bootstrap4) # delayed active 
}

mean(imm_r)*100 # Bootstrapped mean (immediate registered) 
mean(imm_a)*100 # Bootstrapped mean (immediate active) 
mean(del_r)*100 # Bootstrapped mean (delayed registered) 
mean(del_a)*100 # Bootstrapped mean (delayed active) 

quantile(imm_r, probs = c(0.025, 0.975))*100 # Bootstrapped CI (immediate registered) 
quantile(imm_a, probs = c(0.025, 0.975))*100 # Bootstrapped CI (immediate active) 
quantile(del_r, probs = c(0.025, 0.975))*100 # Bootstrapped CI (delayed registered)
quantile(del_a, probs = c(0.025, 0.975))*100 # Bootstrapped CI (delayed active) 

hist(imm_r) # Distribution of Bootstrapped statistic (immediate registered)
hist(imm_a) # Distribution of Bootstrapped statistic (immediate active)
hist(del_r) # Distribution of Bootstrapped statistic (delayed registered)
hist(del_a) # Distribution of Bootstrapped statistic (delayed active)

save.image("Analyses.RData") # Save workspace

###### Chi-Square tests : ######
# Registration: 
(chisq.test(combined_resp_2021$insights, combined_resp_2021$registered)) # Association between insights and registration (n.s. -> no R-indicator)

# Activity: 
(chisq.test(combined_resp_2021$insights, combined_resp_2021$active)) # Association between insights and activity  (n.s. -> no R-indicator)

###### Survival analysis: ######
# Kaplan-Meier: 
ggsurvplot( # plot kaplan-meier curve 
  
  # add survival object 
  fit = survfit(Surv(ndays_rounded, dropout) ~ insights, data = combined_active_2021), 
  
  # change axes (titles and limits)
  xlim = c(0,15),
  break.x.by = 1,
  xlab = "Days", 
  ylab = "Probability of remaining in the study", 
  
  # Change legend
  legend = "bottom",legend.title = "", 
  legend.labs = c("Delayed (with CI)", "Immediate (with CI)"),
  
  # Add CI 
  conf.int = TRUE,
  
  # Change colors
  palette = c("#E7B800", "#2E9FDF"),
  
  # Remove censor points 
  censor = FALSE)

# Log-Rank Test: 
lr_insights <- survdiff(Surv(combined_active_2021$ndays_rounded, combined_active_2021$dropout) ~ combined_active_2021$insights) # log rank test

(lr_insights$obs/lr_insights$n)*100 # Calculate observed dropout rate
(lr_insights$exp/lr_insights$n)*100 # Calculate expected dropout rate 
1-pchisq(lr_insights$chisq, length(lr_insights$n) -1) # Get p value  
lr_insights$chisq # Get chi-square 

obs_i <- NULL # Storage for Bootstrap (observed immediate)
exp_i <- NULL # Storage for Bootstrap (expected immediate)
obs_d <- NULL # Storage for Bootstrap (observed delayed)
exp_d <- NULL # Storage for Bootstrap (expected delayed)
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the combined dataset: 
  sample_d = combined_active_2021[sample(1:nrow(combined_active_2021), nrow(combined_active_2021), replace = TRUE), ]
  
  # Calculate observed and expected dropout in Bootstrapped samples: 
  model_bootstrap <- survdiff(Surv(sample_d$ndays_rounded, sample_d$dropout) ~ sample_d$insights)
  
  # Save the results: 
  obs_i <- c(obs_i, (model_bootstrap$obs[2]/model_bootstrap$n[2])) # observed immediate
  obs_d <- c(obs_d, (model_bootstrap$obs[1]/model_bootstrap$n[1])) # observed delayed 
  exp_i <- c(exp_i, (model_bootstrap$exp[2]/model_bootstrap$n[2])) # expected immediate 
  exp_d <- c(exp_d, (model_bootstrap$exp[1]/model_bootstrap$n[1])) # expected delayed
}

mean(obs_i)*100 # Bootstrapped mean (observed immediate) 
mean(obs_d)*100 # Bootstrapped mean (observed delayed) 
mean(exp_i)*100 # Bootstrapped mean (expected immediate) 
mean(exp_d)*100 # Bootstrapped mean (expected delayed) 

quantile(obs_i, probs = c(0.025, 0.975))*100 # Bootstrapped CI (observed immediate)
quantile(obs_d, probs = c(0.025, 0.975))*100 # Bootstrapped CI (observed delayed) 
quantile(exp_i, probs = c(0.025, 0.975))*100 # Bootstrapped CI (expected immediate) 
quantile(exp_d, probs = c(0.025, 0.975))*100 # Bootstrapped CI (expected delayed)

hist(obs_i) # Distribution of Bootstrapped statistic (observed immediate)
hist(obs_d) # Distribution of Bootstrapped statistic (observed delayed)
hist(exp_i) # Distribution of Bootstrapped statistic (expected immediate)
hist(exp_d) # Distribution of Bootstrapped statistic (expected delayed)

save.image("Analyses.RData") # Save workspace

#### Effects of Contact Mode on Registration, Activity, and Dropout (RQ 3) ####
# Subset interviewer and mail groups 
nl_resp_2021_f2f <- subset(nl_resp_2021, insampleF2F == TRUE) # Subset full face-to-face sample (full sample)
nl_active_2021_f2f <- subset(nl_active_2021, insampleF2F == TRUE) # Subset full face-to-face sample (active households)
interviewer_f2f <- subset(nl_resp_2021_f2f, mode == 1) # Subset households with interviewer (full f2f) 
interviewer_f2f <- as.data.frame(interviewer_f2f) # Make sure it is a dataframe 
mail_f2f <- subset(nl_resp_2021_f2f, mode == 0) # Subset households with mail (full f2f)
mail_f2f <- as.data.frame(mail_f2f) # Make sure it is a dataframe 

##### Full face to face sample: #####
###### Response rates per condition: ######
# Registration rate per condition: 
(mean(interviewer_f2f$registered))*100 # Registration rate in the interviewer group
(mean(mail_f2f$registered))*100 # Registration rate in the mail group 

# Activity rate per condition: 
(mean(interviewer_f2f$active))*100 # Activity rate in the interviewer group 
(mean(mail_f2f$active))*100 # Activity rate in the mail group 

# Bootstrap: 
int_r_f2f <- NULL # Storage (registration interviewer)
int_a_f2f <- NULL # Storage (activity interviewer)
mail_r_f2f <- NULL # Storage (registration mail)
mail_a_f2f <- NULL # Storage (activity mail)
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the 2021 NL + ES f2f dataset: 
  sample_d = nl_resp_2021_f2f[sample(1:nrow(nl_resp_2021_f2f), nrow(nl_resp_2021_f2f), replace = TRUE), ]
  sample_d1 <- subset(sample_d, mode == 1) # interviewer
  sample_d2 <- subset(sample_d, mode == 0) # mail 
  
  # calculate registration and activity rates in Bootstrapped samples:  
  model_bootstrap1 <- (mean(sample_d1$registered)) # registration interviewer 
  model_bootstrap2 <- (mean(sample_d1$active)) # activity interviewer 
  
  model_bootstrap3 <- (mean(sample_d2$registered)) # registration mail 
  model_bootstrap4 <- (mean(sample_d2$active)) # active mail
  
  # Save the results 
  int_r_f2f <- c(int_r_f2f, model_bootstrap1) # registration interviewer 
  int_a_f2f <- c(int_a_f2f, model_bootstrap2) # activity interviewer
  mail_r_f2f <- c(mail_r_f2f, model_bootstrap3) # registration mail 
  mail_a_f2f <- c(mail_a_f2f, model_bootstrap4) # activity mail 
}

mean(int_r_f2f)*100 # Bootstrapped mean (registration interviewer) 
mean(int_a_f2f)*100 # Bootstrapped mean (activity interviewer) 
mean(mail_r_f2f)*100 # Bootstrapped mean (registration mail)
mean(mail_a_f2f)*100 # Bootstrapped mean (activity mail) 

quantile(int_r_f2f, probs = c(0.025, 0.975))*100 # Bootstrapped CI (registration interviewer) 
quantile(int_a_f2f, probs = c(0.025, 0.975))*100 # Bootstrapped CI (activity interviewer) 
quantile(mail_r_f2f, probs = c(0.025, 0.975))*100 # Bootstrapped CI (registration mail) 
quantile(mail_a_f2f, probs = c(0.025, 0.975))*100 # Bootstrapped CI (activity mail) 

hist(int_r_f2f) # Distribution of Bootstrapped statistic (registration interviewer)
hist(int_a_f2f) # Distribution of Bootstrapped statistic (activity interviewer)
hist(mail_r_f2f) # Distribution of Bootstrapped statistic (registration mail)
hist(mail_a_f2f) # Distribution of Bootstrapped statistic (activity mail)

save.image("Analyses.RData") # Save workspace

###### Chi-Square tests : #######
# Registration: 
(chisq.test(nl_resp_2021_f2f$mode, nl_resp_2021_f2f$registered)) # Association between contact mode and registration (s. -> calculate R-indicator)

ir_int_f2f <- getRIndicator(formula = registered ~ age + hhsize + origin + edulev + urban + homeowner + income, # Formulate response model
                            sampleData = interviewer_f2f, # Use interviewer dataset
                            sampleWeights = rep(3000, nrow(interviewer_f2f)), # Set sample weights
                            withPartials = TRUE) # Ask for partial indicators

ir_int_f2f$RUnadj # R-indicator 
ir_int_f2f$CVUnadj # Coefficient of variation 
ir_int_f2f$partialR$byVariables # Unconditional partial indicators (PuUnadj) and conditional partial indicators (PcUnadj)

R_int_r <- NULL # Storage for Bootstrap, R indicator for registration in interviewer group 
set.seed(2022) # Set seed
for(r in 1:R){ #Bootstrap 
  
  # Create resampled dataset from the interviewer group: 
  sample_d = interviewer_f2f[sample(1:nrow(interviewer_f2f), nrow(interviewer_f2f), replace = TRUE), ]
  
  # Calculate R indicator in Bootstrapped samples: 
  Model_bootstrap <- getRIndicator(formula = registered ~ age + hhsize + origin + edulev + urban + homeowner + income,
                                   sampleData = sample_d, 
                                   sampleWeights = rep(3000, nrow(sample_d)))
  
  R_Bootstrap <- Model_bootstrap$RUnadj
  
  # Save the results: 
  R_int_r <- c(R_int_r, R_Bootstrap)
}

mean(R_int_r) # Bootstrapped mean
quantile(R_int_r, probs = c(0.025, 0.975)) # Bootstrapped CI
hist(R_int_r) # Distribution of Bootstrapped statistic

save.image("Analyses.RData") # Save workspace

ir_mail_f2f <- getRIndicator(formula = registered ~ age + hhsize + origin + edulev + urban + homeowner + income, # Formulate response model
                             sampleData = mail_f2f, # Use mail dataset
                             sampleWeights = rep(3000, nrow(mail_f2f)), # Set sample weights
                             withPartials = TRUE) # Ask for partial indicators

ir_mail_f2f$RUnadj # R-indicator 
ir_mail_f2f$CVUnadj # Coefficient of variation 
ir_mail_f2f$partialR$byVariables # Unconditional partial indicators (PuUnadj) and conditional partial indicators (PcUnadj)

R_mail_r <- NULL # Storage for Bootstrap, R indicator for registration in mail group 
set.seed(2022) # Set seed
for(r in 1:R){ #Bootstrap 
  
  # Create resampled dataset from the mail group: 
  sample_d = mail_f2f[sample(1:nrow(mail_f2f), nrow(mail_f2f), replace = TRUE), ]
  
  # Calculate R indicator in Bootstrapped samples: 
  Model_bootstrap <- getRIndicator(formula = registered ~ age + hhsize + origin + edulev + urban + homeowner + income,
                                   sampleData = sample_d, 
                                   sampleWeights = rep(3000, nrow(sample_d)))
  
  R_Bootstrap <- Model_bootstrap$RUnadj
  
  # Save the results: 
  R_mail_r <- c(R_mail_r, R_Bootstrap)
}

mean(R_mail_r) # Bootstrapped mean 
quantile(R_mail_r, probs = c(0.025, 0.975)) # Bootstrapped CI 
hist(R_mail_r) # Distribution of Bootstrapped statistic

save.image("Analyses.RData") # Save workspace

# Activity: 
(chisq.test(nl_resp_2021_f2f$mode, nl_resp_2021_f2f$active)) # Association between contact mode and activity (s. -> calculate R-indicator)

ia_int_f2f <- getRIndicator(formula = active ~ age + hhsize + origin + edulev + urban + homeowner + income, # Formulate response model
                            sampleData = interviewer_f2f, # Use interviewer dataset
                            sampleWeights = rep(3000, nrow(interviewer_f2f)), # Set sample weights
                            withPartials = TRUE) # Ask for partial indicators

ia_int_f2f$RUnadj # R-indicator 
ia_int_f2f$CVUnadj # Coefficient of variation 
ia_int_f2f$partialR$byVariables # Unconditional partial indicators (PuUnadj) and conditional partial indicators (PcUnadj)

R_int_a <- NULL # Storage for Bootstrap, R indicator for activity in interviewer group 
set.seed(2022) # Set seed
for(r in 1:R){ #Bootstrap 
  
  # Create resampled dataset from the interviewer group: 
  sample_d = interviewer_f2f[sample(1:nrow(interviewer_f2f), nrow(interviewer_f2f), replace = TRUE), ]
  
  # Calculate R indicator in Bootstrapped samples: 
  Model_bootstrap <- getRIndicator(formula = active ~ age + hhsize + origin + edulev + urban + homeowner + income,
                                   sampleData = sample_d, 
                                   sampleWeights = rep(3000, nrow(sample_d)))
  
  R_Bootstrap <- Model_bootstrap$RUnadj
  
  # Save the results: 
  R_int_a <- c(R_int_a, R_Bootstrap)
}

mean(R_int_a) # Bootstrapped mean 
quantile(R_int_a, probs = c(0.025, 0.975)) # Bootstrapped CI 
hist(R_int_a) # Distribution of Bootstrapped statistic

save.image("Analyses.RData") # Save workspace

ia_mail_f2f <- getRIndicator(formula = active ~ age + hhsize + origin + edulev + urban + homeowner + income, # Formulate response model
                             sampleData = mail_f2f, # Use mail dataset
                             sampleWeights = rep(3000, nrow(mail_f2f)), # Set sample weights
                             withPartials = TRUE) # Ask for partial indicators

ia_mail_f2f$RUnadj # R-indicator 
ia_mail_f2f$CVUnadj # Coefficient of variation 
ia_mail_f2f$partialR$byVariables # Unconditional partial indicators (PuUnadj) and conditional partial indicators (PcUnadj)

R_mail_a <- NULL # Storage for Bootstrap, R indicator for activity in mail group 
set.seed(2022) # Set seed
for(r in 1:R){ #Bootstrap 
  
  # Create resampled dataset from the mail group: 
  sample_d = mail_f2f[sample(1:nrow(mail_f2f), nrow(mail_f2f), replace = TRUE), ]
  
  # Calculate R indicator in Bootstrapped samples: 
  Model_bootstrap <- getRIndicator(formula = active ~ age + hhsize + origin + edulev + urban + homeowner + income,
                                   sampleData = sample_d, 
                                   sampleWeights = rep(3000, nrow(sample_d)))
  
  R_Bootstrap <- Model_bootstrap$RUnadj
  
  # Save the results: 
  R_mail_a <- c(R_mail_a, R_Bootstrap)
}

mean(R_mail_a) # Bootstrapped mean 
quantile(R_mail_a, probs = c(0.025, 0.975)) # Bootstrapped CI 
hist(R_mail_a) # Distribution of Bootstrapped statistic

save.image("Analyses.RData") # Save workspace



###### Survival analysis: ######
# Kaplan-meier: 
ggsurvplot( # plot kaplan-meier curve 
  
  # add survival object 
  fit = survfit(Surv(ndays_rounded, dropout) ~ mode, data = nl_active_2021_f2f), 
  
  # change axes (titles and limits)
  xlim = c(0,15),
  break.x.by = 1,
  xlab = "Days", 
  ylab = "Probability of remaining in the study", 
  
  # Change legend
  legend = "bottom",legend.title = "", 
  legend.labs = c("Mail (with CI)", "Interviewer (with CI)"),
  
  # Add CI 
  conf.int = TRUE,
  
  # Change colors
  palette = c("#E7B800", "#2E9FDF"),
  
  # Remove censor points 
  censor = FALSE)

# Log-rank: 
(lr_mode_f2f <- survdiff(Surv(nl_active_2021_f2f$ndays_rounded, nl_active_2021_f2f$dropout) ~ nl_active_2021_f2f$mode)) # Log rank test

(lr_mode_f2f$obs/lr_mode_f2f$n)*100 # Calculate observed dropout rate
(lr_mode_f2f$exp/lr_mode_f2f$n)*100 # Calculate expected dropout rate
1-pchisq(lr_mode_f2f$chisq, length(lr_mode_f2f$n) -1) # Get p value 
lr_mode_f2f$chisq # Get chi-square 

obs_int_f2f <- NULL # Storage (observed interviewer)
exp_int_f2f <- NULL # Storage (expected interviewer)
obs_mail_f2f <- NULL # Storage (observed mail)
exp_mail_f2f <- NULL # Storage (expected mail)
set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the 2021 NL + ES f2f dataset: 
  sample_d = nl_active_2021_f2f[sample(1:nrow(nl_active_2021_f2f), nrow(nl_active_2021_f2f), replace = TRUE), ]
  
  # Calculate registration rate in Bootstrapped samples: 
  model_bootstrap <- survdiff(Surv(sample_d$ndays_rounded, sample_d$dropout) ~ sample_d$mode)
  
  # Save the results: 
  obs_int_f2f <- c(obs_int_f2f, (model_bootstrap$obs[2]/model_bootstrap$n[2])) # observed interviewer
  obs_mail_f2f <- c(obs_mail_f2f, (model_bootstrap$obs[1]/model_bootstrap$n[1])) # expected interviewer
  exp_int_f2f <- c(exp_int_f2f, (model_bootstrap$exp[2]/model_bootstrap$n[2])) # observed mail 
  exp_mail_f2f <- c(exp_mail_f2f, (model_bootstrap$exp[1]/model_bootstrap$n[1])) # expected mail 
}

mean(obs_int_f2f)*100 # Bootstrapped mean (observed interviewer) 
mean(obs_mail_f2f)*100 # Bootstrapped mean (observed mail) 
mean(exp_int_f2f)*100 # Bootstrapped mean (expected interviewer) 
mean(exp_mail_f2f)*100 # Bootstrapped mean (expected mail)

quantile(obs_int_f2f, probs = c(0.025, 0.975))*100 # Bootstrapped CI (observed interviewer)
quantile(obs_mail_f2f, probs = c(0.025, 0.975))*100 # Bootstrapped CI (observed mail)
quantile(exp_int_f2f, probs = c(0.025, 0.975))*100 # Bootstrapped CI (expected interviewer)
quantile(exp_mail_f2f, probs = c(0.025, 0.975))*100 # Bootstrapped CI (expected mail)

hist(obs_int_f2f) # Distribution of Bootstrapped statistic (observed interviewer)
hist(obs_mail_f2f) # Distribution of Bootstrapped statistic (observed mail)
hist(exp_int_f2f) # Distribution of Bootstrapped statistic (expected interviewer)
hist(exp_mail_f2f) # Distribution of Bootstrapped statistic (expected mail)

save.image("Analyses.RData") # Save workspace

##### 2020 Web-based #####
# Entries
wtd.mean(nl_resp_2020_2$entries, nl_resp_2020_2$iwPU, na.rm = TRUE) # Weighted mean
wtd.quantile(nl_resp_2020_2$entries, nl_resp_2020_2$iwPU, probs = c(0.25, .5, 0.75, 1), na.rm = TRUE) # Weighted quartiles 
sqrt(wtd.var(nl_resp_2020_2$entries, nl_resp_2020_2$iwPU, na.rm = TRUE)) # Weighted standard deviation

# Amount range
wtd.mean(nl_resp_2020_2$amountvar_w, nl_resp_2020_2$iwPU, na.rm = TRUE) # Weighted mean
wtd.quantile(nl_resp_2020_2$amountvar_w, nl_resp_2020_2$iwPU, probs = c(0.25, .5, 0.75, 1), na.rm = TRUE) # Weighted quartiles 
sqrt(wtd.var(nl_resp_2020_2$amountvar_w, nl_resp_2020_2$iwPU, na.rm = TRUE)) # Weighted standard deviations

# SD amount
wtd.mean(nl_resp_2020_2$amountsd_w, nl_resp_2020_2$iwPU, na.rm = TRUE) # Weighted mean
wtd.quantile(nl_resp_2020_2$amountsd_w, nl_resp_2020_2$iwPU, probs = c(0.25, .5, 0.75, 1), na.rm = TRUE) # Weighted quartiles
sqrt(wtd.var(nl_resp_2020_2$amountsd_w, nl_resp_2020_2$iwPU, na.rm = TRUE)) # Weighted standard deviations 

# Products range 
wtd.mean(nl_resp_2020_2$nprodvar_w, nl_resp_2020_2$iwPU, na.rm = TRUE) # Weighted mean
wtd.quantile(nl_resp_2020_2$nprodvar_w, nl_resp_2020_2$iwPU, probs = c(0.25, .5, 0.75, 1), na.rm = TRUE) # Weighted quartiles 
sqrt(wtd.var(nl_resp_2020_2$nprodvar_w, nl_resp_2020_2$iwPU, na.rm = TRUE)) # Weighted standard devation

# SD products 
wtd.mean(nl_resp_2020_2$nprodsd_w, nl_resp_2020_2$iwPU, na.rm = TRUE) # Weighted mean 
wtd.quantile(nl_resp_2020_2$nprodsd_w, nl_resp_2020_2$iwPU, probs = c(0.25, .5, 0.75, 1), na.rm = TRUE) # Weighted quartiles 
sqrt(wtd.var(nl_resp_2020_2$nprodsd_w, nl_resp_2020_2$iwPU, na.rm = TRUE)) # Weighted standard deviation

# Store types 
wtd.mean(nl_resp_2020_2$ncat, nl_resp_2020_2$iwPU, na.rm = TRUE) # Weighted mean 
wtd.quantile(nl_resp_2020_2$ncat, nl_resp_2020_2$iwPU, probs = c(0.25, .5, 0.75, 1), na.rm = TRUE) # Weighted quartiles 
sqrt(wtd.var(nl_resp_2020_2$ncat, nl_resp_2020_2$iwPU, na.rm = TRUE)) # Weighted standard deviations

# Bootstrap
nl2020_entry_mean <- NULL # Storage (entries)
nl2020_amountvar_mean <- NULL # Storage (amount range)
nl2020_amountsd_mean <- NULL # Storage (amount sd)
nl2020_nprodvar_mean <- NULL # Storage (products range)
nl2020_nprodsd_mean <- NULL # Storage (products sd)
nl2020_ncat_mean <- NULL # Storage (store types)

set.seed(2022) # Set seed 
for(r in 1:R){ # Bootstrap 
  
  # Create resampled dataset from the 2020 dataset: 
  sample_d = nl_resp_2020_2[sample(1:nrow(nl_resp_2020_2), nrow(nl_resp_2020_2), replace = TRUE), ]
  
  # Calculate indicators in Bootstrapped samples:  
  model_bootstrap1 <- wtd.mean(sample_d$entries, sample_d$iwPU, na.rm = TRUE) # Entries
  model_bootstrap3 <- wtd.mean(sample_d$amountvar_w, sample_d$iwPU, na.rm = TRUE) # Amount range
  model_bootstrap5 <- wtd.mean(sample_d$amountsd_w, sample_d$iwPU, na.rm = TRUE) # Amount sd
  model_bootstrap7 <- wtd.mean(sample_d$nprodvar_w, sample_d$iwPU, na.rm = TRUE) # Products range
  model_bootstrap9 <- wtd.mean(sample_d$nprodsd_w, sample_d$iwPU, na.rm = TRUE) # Products sd
  model_bootstrap11 <- wtd.mean(sample_d$ncat, sample_d$iwPU, na.rm = TRUE) # Store types
  
  # Save the results 
  nl2020_entry_mean <- c(nl2020_entry_mean, model_bootstrap1) # Entries
  nl2020_amountvar_mean <- c(nl2020_amountvar_mean, model_bootstrap3) # Amount range
  nl2020_amountsd_mean <- c(nl2020_amountsd_mean, model_bootstrap5) # Amount sd
  nl2020_nprodvar_mean <- c(nl2020_nprodvar_mean, model_bootstrap7) # Products range
  nl2020_nprodsd_mean <- c(nl2020_nprodsd_mean, model_bootstrap9) # Products sd
  nl2020_ncat_mean <- c(nl2020_ncat_mean, model_bootstrap11) # Store types
  
}

mean(nl2020_entry_mean) # Bootstrapped mean (entries) 
mean(nl2020_amountvar_mean) # Bootstrapped mean (amount range) 
mean(nl2020_amountsd_mean)  # Bootstrapped mean (amount sd) 
mean(nl2020_nprodvar_mean) # Bootstrapped mean (products range) 
mean(nl2020_nprodsd_mean) # Bootstrapped mean (products sd)  
mean(nl2020_ncat_mean) # Bootstrapped mean (store types) 

quantile(nl2020_entry_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (entries)  
quantile(nl2020_amountvar_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (amount range) 
quantile(nl2020_amountsd_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (amount sd) 
quantile(nl2020_nprodvar_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (products range)  
quantile(nl2020_nprodsd_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (products sd)  
quantile(nl2020_ncat_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (store types) 

hist(nl2020_entry_mean) # Distribution of Bootstrapped statistic (entries)
hist(nl2020_amountvar_mean) # Distribution of Bootstrapped statistic (amount range)
hist(nl2020_amountsd_mean) # Distribution of Bootstrapped statistic (amount sd)
hist(nl2020_nprodvar_mean) # Distribution of Bootstrapped statistic (products range)
hist(nl2020_nprodsd_mean) # Distribution of Bootstrapped statistic (products sd)
hist(nl2020_ncat_mean) # Distribution of Bootstrapped statistic (store types)

##### NL 2021 #####
# Entries 
summary(nl_active_2021$entriesw1, na.rm = T) # Mean and quartiles 
sd(nl_active_2021$entriesw1, na.rm = T) # Standard deviation

# Amount range
summary(nl_active_2021$amountvarw1, na.rm = T) # Mean and quartiles 
sd(nl_active_2021$amountvarw1, na.rm = T) # Standard deviation 

# SD amount
summary(nl_active_2021$sdamountw1, na.rm = T) # Mean and quartiles
sd(nl_active_2021$sdamountw1, na.rm = T) # Standard deviation 

# Products range
summary(nl_active_2021$nprodvarw1, na.rm = T) # Mean and quartiles
sd(nl_active_2021$nprodvarw1, na.rm = T) # Standard deviation 

# SD products
summary(nl_active_2021$sdnprodw1, na.rm = T) # Mean and quartiles 
sd(nl_active_2021$sdnprodw1, na.rm = T) # Standard deviation 

# Store types
summary(nl_active_2021$ncatw1, na.rm = T) # Mean and quartiles
sd(nl_active_2021$ncatw1, na.rm = T) # Standard deviation 

# Bootstrap
nl_entry_mean <- NULL # Storage (entries)
nl_amountvar_mean <- NULL # Storage (amount range)
nl_amountsd_mean <- NULL # Storage (amount sd)
nl_nprodvar_mean <- NULL # Storage (products range)
nl_nprodsd_mean <- NULL # Storage (products sd)
nl_ncat_mean <- NULL # Storage (store types)
set.seed(2022) # Set seed
for(r in 1:R){ # Bootstrap
  
  # Create resampled dataset from the 2021 NL dataset: 
  sample_d = nl_active_2021[sample(1:nrow(nl_active_2021), nrow(nl_active_2021), replace = TRUE), ]
  
  # Calculate statistics in Bootstrapped samples:  
  model_bootstrap1 <- mean(sample_d$entriesw1, na.rm = T) # Entries
  model_bootstrap3 <- mean(sample_d$amountvarw1, na.rm = T) # Amount range
  model_bootstrap5 <- mean(sample_d$sdamountw1, na.rm = T) # Amount sd
  model_bootstrap7 <- mean(sample_d$nprodvarw1, na.rm = T) # Products range
  model_bootstrap9 <- mean(sample_d$sdnprodw1, na.rm = T) # Products sd
  model_bootstrap11 <- mean(sample_d$ncatw1, na.rm = T) # Store types
  
  # Save the results:  
  nl_entry_mean <- c(nl_entry_mean, model_bootstrap1) # Entries
  nl_amountvar_mean <- c(nl_amountvar_mean, model_bootstrap3) # Amount range
  nl_amountsd_mean <- c(nl_amountsd_mean, model_bootstrap5) # Amount sd
  nl_nprodvar_mean <- c(nl_nprodvar_mean, model_bootstrap7) # Products range
  nl_nprodsd_mean <- c(nl_nprodsd_mean, model_bootstrap9) # Products sd
  nl_ncat_mean <- c(nl_ncat_mean, model_bootstrap11) # Store types
}

mean(nl_entry_mean) # Bootstrapped mean (entries)  
mean(nl_amountvar_mean) # Bootstrapped mean (amount range)
mean(nl_amountsd_mean) # Bootstrapped mean (amount sd)
mean(nl_nprodvar_mean) # Bootstrapped mean (products range)
mean(nl_nprodsd_mean) # Bootstrapped mean (products sd)
mean(nl_ncat_mean) # Bootstrapped mean (store types) 

quantile(nl_entry_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (entries) 
quantile(nl_amountvar_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (amount range)
quantile(nl_amountsd_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (amount sd) 
quantile(nl_nprodvar_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (products range)   
quantile(nl_nprodsd_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (products sd)  
quantile(nl_ncat_mean, probs = c(0.025, 0.975)) # Bootstrapped CI (store types)

hist(nl_entry_mean) # Distribution of Bootstrapped statistic (entries)
hist(nl_amountvar_mean) # Distribution of Bootstrapped statistic (amount range)
hist(nl_amountsd_mean) # Distribution of Bootstrapped statistic (amount sd)
hist(nl_nprodvar_mean) # Distribution of Bootstrapped statistic (products range)
hist(nl_nprodsd_mean) # Distribution of Bootstrapped statistic (products sd)
hist(nl_ncat_mean) # Distribution of Bootstrapped statistic (store types)

