#--- 7751_regressions.R --- [R] code for reproducing results of "Investigating
#--- selection bias of online surveys on coronavirus-related behavioral
#--- outcomes"
#
# Title: 
# Filename: 7751_regressions.R
# Description: 
# Author: Bernd Weiss
# Maintainer: Bernd Weiss
# DOI: 
# Created: Fr Mai 29 09:18:56 2020 (+0200)
# Last-Updated: Fr Mai 29 09:32:50 2020 (+0200)
#           By: weissbd
#     Update #: 5
# Version: 
# Dependencies: [Software (Version), Software (Version)...)
# Depends on: [File1, file2, ...] 
# Data source: [name of dataset] (DOI) 
# URL: 
# Keywords: 
# Licence: Attribution 4.0 International (CC BY 4.0) 
# Licence-URL: https://creativecommons.org/licenses/by/4.0/ 
# Compatibility: 
# 
################################################################################
# 
## Commentary: 
# 
# This is [R|Stata|...] code can give insights into statistical analyses 
# conducted in the following paper :
# 
# "Investigating selection bias of online surveys on coronavirus-related
# behavioral outcomes".
# 
# Other code files may need to be executed first or this code file will be 
# called by another code file that hopefully is listed under "Depends on:".
# 
# Due to [German|...] data protection laws we may be unable to provide the 
# data. In that case you may be unable to reproduce our statistical analyses
# yourself. However, you might be able to obtain the data from [http://].
# 
################################################################################
# 
# 
# 
################################################################################
# 
## Change Log:
# 
# 
################################################################################
# 
## Code:




## -------------------------------------------------------------------------- ##
## +++ Some additionl data preparation steps                                  ##
## -------------------------------------------------------------------------- ##


## Assign labels to all Y-, coronavirus-related outcomes.
attr(dfx$hzcy006a, "label") <- "Measures taken: Avoided places"
attr(dfx$hzcy007a, "label") <- "Measures taken: Kept distance"
attr(dfx$hzcy008a, "label") <- "Measures taken: Adapted school or work situation"
attr(dfx$hzcy009a, "label") <- "Measures taken: Quarantine due to symptoms"
attr(dfx$hzcy010a, "label") <- "Measures taken: Quarantine without symptoms"
attr(dfx$hzcy011a, "label") <- "Measures taken: Washed hands more often"
attr(dfx$hzcy012a, "label") <- "Measures taken: Used disinfectant"
attr(dfx$hzcy013a, "label") <- "Measures taken: Stocks increased"
attr(dfx$hzcy014a, "label") <- "Measures taken: Reduced social interactions"
attr(dfx$hzcy015a, "label") <- "Measures taken: Worn face mask"                     


## Note: In the following we speak of Y, L, and Z variables. This notation
## refers to Figure 1 in our paper. 

## Y variables: Measures to minimize risk (hzcy006a:hzcy018a) and online
##              participation.
yvars <- c(paste0("hzcy0", sprintf('%0.2d', 6:15), "a"), ## already dichotom
           "parthzgf")                                   ## R_y: Internet yes/no
yvars
head(dfx[, yvars])


## L variables: Big Five (gdze001a-gdze010a).
lvars <- c(grep("^bfi_*", names(dfx), value = TRUE))
lvars
head(dfx[, lvars])


## Z variables: age, education_cat, sex.
zvars_all <- c("female", "education_cat", "age")
zvars_all_fml <- str_flatten(zvars_all, collapse = " + ")
zvars_all_fml




## -------------------------------------------------------------------------- ##
## +++ Identify relevant L (predictors) and Y (corona-related) variables      ##
## -------------------------------------------------------------------------- ##


## Create matrix of variable names of all combinations of Y, L, Z variables.
ylzvars_mv <- expand.grid(yvars, lvars, zvars_all_fml)
names(ylzvars_mv) <- c("y", "l", "z_all")
dim(ylzvars_mv)
head(ylzvars_mv)
nregressions_mv <- nrow(ylzvars_mv)


## Based on this matrix of all Y, L, Z variable combinations, create a new
## column "mfamily", which, based on the number of scale points of Y, is either
## binomial (nscalepoints <= 2) or gaussian (everything else). This is
## accomplished by creating a table, and then the number of categories is
## counted.
ylzvars_mv %>% rowwise() %>% mutate(nscalepoints =
                                        length(table(dfx[as.character(y)]))) %>%
    mutate(mfamily = ifelse(nscalepoints <= 2,
                            "binomial",
                            "gaussian")) -> ylzvars_mv 


## Convert factor vectors into character vectors, which is easier to deal with
## in pmap() (see below).
ylzvars_mv$y <- as.character(ylzvars_mv$y)
ylzvars_mv$l <- as.character(ylzvars_mv$l)
ylzvars_mv$z <- as.character(ylzvars_mv$z_all)


# Actually run all regressions and save to ylzvars_mv_regs; the regressions
# objects are stored in a column called "model".
ylzvars_mv %>% select(-nscalepoints) %>%
    mutate(model = pmap(list(y, l, z, mfamily),
                        create_ylz_formula)) -> ylzvars_mv_regs


## Extract coefficients, SEs etc. from each model and store in a list. 
ylzvars_mv_regs_tidied <- map(ylzvars_mv_regs$model, broom::tidy)


## Create a new column with list elements in ylzvars_regs that contains the
## extracted coefficients. 
ylzvars_mv_regs$mtidied <- ylzvars_mv_regs_tidied


## Expand list elements (each list contains six rows: intercept, l and z
## coeff), hence the number of rows should increase by a factor of 5. 
ylzvars_mv_regs %>% unnest(mtidied) %>% select(-model) -> ylzvars_mv_regs_coeff
ylzvars_mv_regs_coeff %>% head %>% data.frame
dim(ylzvars_mv_regs)
dim(ylzvars_mv_regs_coeff)
nrow(ylzvars_mv_regs_coeff) / nrow(ylzvars_mv_regs)


## Add variable labels and remove rowwise() by ungroup().
ylzvars_mv_regs_coeff %>%
    ungroup() %>%
    ## Since all BFI variables are of type logic, the actual term is
    ## "bfi_imageTRUE", which cannot be parsed, hence, "TRUE" is removed.
    mutate(term = str_replace(term, "TRUE", "")) %>%
    ## Remove intercept rows.
    filter(term != "(Intercept)") %>%
    ## Remove all Z variables ("control variables"), keep only L variables.
    filter(term %in% lvars) %>%
    mutate(ylab = get_varlabels(dfx, y)) %>%
    mutate(llab = get_varlabels(dfx, term)) %>%
    arrange(p.value, estimate) %>%
    select(y, ylab, term, llab, z, everything()) -> ylzvars_mv_regs_coeff_lab


## Now, the goal ist to identify those L variables that are statistical
## significant for R_y and Y. The idea is to simply split the overall
## tibble/data.frame in one part only containing parthzgf and another part only
## containing Y outcome variables and then merge rowwise.
ylzvars_mv_regs_coeff_lab %>% filter(y == "parthzgf") %>%
    ## Create an identifier that can be used in merge().
    mutate(lz = paste0(term, z)) -> tab_mv_coeff_internet 
data.frame(head(tab_mv_coeff_internet))

ylzvars_mv_regs_coeff_lab %>% filter(y != "parthzgf") %>%
    ## Create an identifier that can be used in merge().
    mutate(lz = paste0(term, z)) -> tab_mv_coeff_y
data.frame(head(tab_mv_coeff_y))

tab_mv_coeff_y %>% arrange(lz) %>% select(y, lz, estimate)



## Merge the "parthzgf" (online participation) and risk-minimizing measures
## part.
tab_mv_ry <- merge(tab_mv_coeff_internet, tab_mv_coeff_y, by = "lz") 
head(tab_mv_ry)
dim(tab_mv_ry)
dim(tab_mv_coeff_internet)
dim(tab_mv_coeff_y)


## Next, select all L variables that are statistically significant for R_y
## (online) and Y (corona-related variables). 
tab_mv_ry %>%
    filter((p.value.x < 0.05) & (p.value.y < 0.05)) %>%
    select(-starts_with("std.error"),
           -starts_with("statistic"),
           -starts_with("flag"),
           -lz) %>%
    arrange(estimate.x, estimate.y) %>%
    select(y.x, y.y, ylab.y, term.x, llab.x, everything()) -> tab_mv_ry_selected
dim(tab_mv_ry_selected)
head(tab_mv_ry_selected)
tab_mv_ry_selected


## Export selection of L and Y variables to HTML table.
tab_mv_ry_selected %>% mutate_if(is.numeric, round, 2) -> tab_aredbonf
saveWidget(datatable(tab_aredbonf, options = list(pageLength = 100000)),
           file = file.path(tabpath, "t_mv_sig_at_r-and-y.html"))


################################################################################
#--- 7751_regressions.R ends here
