# Modeling public opinion over time and space: Trust in state institutions in Europe, 1989-2019
# 
# 20 February 2023
#
# Replication materials: Harmonization


# Start ------

# path to replication materials
# NOTE: every relative path used below (the cross-walk tables under
# "1_harmonization/", the data files under data_path, and the .Rds outputs)
# is resolved against this working directory.
setwd("replication/")

# set path to the directory with data files
# (used as a prefix in paste0(), so it must end with a slash)
data_path <- "data/"


# Packages
# plyr, janitor and sjlabelled are not attached; they are called below with
# the pkg::fun() form only.
library(haven) # for opening SPSS files
library(readstata13) # for opening Stata files
library(labelled) # for using labels
library(questionr) # for getting frequencies
library(rio) # for importing and exporting to different formats
library(tidyverse) # for manipulating data

# list of all survey variables by project
# Columns read by this script: study, study_wave, wave, year, src_var
# (variable name in the source file) and target_var (harmonized name).
vars_all <- rio::import("1_harmonization/harmonization/vars_all.xlsx")



# ASES (Asia-Europe Survey) -----------

### SPECIAL ###
# In Ireland, "still at school" has a separate code. Needs converting into 
# (current) schooling years and education level based on age.

# Read the ASES data file, keeping user-defined missing codes as regular
# values (user_na = TRUE). Only the variables listed for ASES in vars_all are
# kept, plus V0231:V0248 (country-specific education variables).
# t_caseid_n is the row position in the source file.
ases <- haven::read_sav(paste0(data_path, "ASES/22324-0001-Data.sav"),
                        user_na = TRUE) %>%
  select(all_of(unique(vars_all$src_var[vars_all$study == "ASES"])),
         V0231:V0248) %>%
  mutate(t_caseid_n = row_number())

# read in cross-walk table (rows without a target variable are not used)
cwt_ases <- rio::import("1_harmonization/cwt/ases_cwt.xlsx") %>%
  drop_na(target_var)

# recoding based on the cross-walk table:
# for every target variable, map the source codes (value_code) of its source
# variable (var_name) to the harmonized codes (target_value)

target_vars <- unique(cwt_ases$target_var)
source_vars <- unique(cwt_ases$var_name)
data_small <- ases %>% zap_labels()

harmonized_vars <- vector("list", length(target_vars))

# seq_along() keeps the loop from running when the cross-walk is empty
for (i in seq_along(target_vars)) {

  target_var_input <- target_vars[i]

  # all cross-walk rows that define this target variable
  cwt_rows <- cwt_ases %>%
    filter(target_var == target_var_input)

  source <- cwt_rows %>% pull(value_code)
  target <- cwt_rows %>% pull(target_value)
  # name of the source column (the first one listed for this target)
  source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

  # .data[[ ]] looks the column up in data_small only, so a missing column
  # fails instead of silently picking up a global object of the same name
  harmonized_vars[[i]] <- data_small %>%
    transmute(!!target_var_input := plyr::mapvalues(
      as.character(.data[[source_varname]]), source, target))
}

# apply final corrections
ases_harm <- cbind(data_small, harmonized_vars) %>%
  mutate(t_caseid = as.character(V0001),
         t_weight_despst = 1,
         t_year = 2000,
         t_wave = as.character(1),
         t_project = "ASES",
         # first non-missing value across all t_educ* columns
         t_educ = coalesce(!!! select(., starts_with("t_educ")))) %>%
  mutate_at(vars(starts_with("trust"), starts_with("t_educ"), t_age, t_female,
                 t_educ, t_polint, t_satpolitics, t_satlife), as.numeric) %>%
  # IE, still at school, t_educ = 0: assign an education level based on age
  mutate(t_educ = ifelse(t_educ == 0 & t_age < 18, 1, t_educ),
         t_educ = ifelse(t_educ == 0 & t_age >= 18 & t_age < 23, 2, t_educ),
         t_educ = ifelse(t_educ == 0 & t_age >= 23, 3, t_educ)) %>%
  select(starts_with("t_"), starts_with("trust"), -starts_with("t_educ_")) %>%
  sjlabelled::remove_all_labels()

# save
saveRDS(ases_harm, "1_harmonization/ASES.Rds")





# CCEB (Candidate Countries Eurobarometer) ------------

### SPECIAL ###
# Country-specific education variables are in separate variables named d3_*

# lookup table: study number / full file name of each CCEB file -> wave label
cceb_filenames <- data.frame(
  file_name = c("04056", "04107", "04350", "04054", "04062"),
  file_name_full = c("04056-0001-Data.por", "04107-0001-Data.por",
                     "04350-0001-Data.por", "04054-0001-Data.sav",
                     "04062-0001-Data.sav"),
  wave = c("2003.4", "2003.2", "2004.1", "2001.1", "2002.2")
)

# read in the survey data

# folder with all CCEB data files; SPSS system (.sav) and portable (.por)
# files are listed and read separately
path <- paste0(data_path, "CCEB")
temp_sav <- list.files(path = path, pattern = ".sav$")
temp_por <- list.files(path = path, pattern = ".por$")

fsav <- file.path(path, temp_sav)
fpor <- file.path(path, temp_por)

# Adds the file name as first column `source`; its variable label is the
# five-character study number at the start of the file name.
tag_source <- function(survey, survey_file) {
  survey$source <- survey_file
  var_label(survey$source) <- substr(survey_file, 1, 5)
  survey %>% dplyr::select(source, everything())
}

cceb_sav <- setNames(lapply(fsav, haven::read_sav, user_na = TRUE), temp_sav)
cceb_sav <- Map(tag_source, cceb_sav, temp_sav)

cceb_por <- setNames(lapply(fpor, haven::read_por, user_na = TRUE), temp_por)
cceb_por <- Map(tag_source, cceb_por, temp_por)

# list with full surveys
cceb_all <- append(cceb_por, cceb_sav)

# put the data frames in the list in the right order
# (corresponding to the wave numbers), then name them by study number
wave_order <- c("04054-0001-Data.sav", "04062-0001-Data.sav",
                "04107-0001-Data.por", "04056-0001-Data.por",
                "04350-0001-Data.por")
cceb_all <- cceb_all[wave_order]
names(cceb_all) <- substr(names(cceb_all), 1, 5)

# get variable names from the vars_all table
vars_cceb <- vars_all %>% filter(study == "CCEB")

# For every CCEB file keep only the needed variables and add the technical
# variables (case id, row number, project, wave, weight).
cceb_small <- cceb_all
for (i in names(cceb_small)) {

  # i is the study number (list name); translate it into the wave label
  wavenum <- cceb_filenames %>%
    filter(file_name == i) %>%
    pull(wave) %>%
    as.character()

  # rows of the variable table for this wave; source names are upper-cased
  # before they are used to select columns
  vars_wave <- vars_cceb %>% filter(wave == wavenum)

  variables <- vars_wave %>% pull(src_var) %>% toupper()
  caseid_var <- vars_wave %>%
    filter(target_var == "t_caseid") %>%
    pull(src_var) %>%
    toupper()
  # FIX: the weight variable has to be looked up by wave label (wavenum), not
  # by the study number i, which never matches the `wave` column; it is also
  # upper-cased like the other variable names
  weight_var <- vars_wave %>%
    filter(target_var %in% c("t_weight_pst", "t_weight_despst")) %>%
    pull(src_var) %>%
    toupper()

  # NOTE(review): eval(parse()) is kept so that a wave without a weight
  # variable (empty weight_var) still yields NULL, i.e. no t_weight_pst
  # column, instead of an error
  cceb_small[[i]] <- cceb_small[[i]] %>%
    select(all_of(variables), starts_with("d13"), starts_with("d3")) %>%
    mutate(t_caseid = as.character(eval(parse(text = caseid_var))),
           t_caseid_n = row_number(),
           t_project = "CCEB",
           t_wave = wavenum,
           t_weight_pst = eval(parse(text = weight_var))) %>%
    zap_labels()
}

# read in cross-walk table: one cross-walk per data file (split on file_name)
cwt_cceb <- rio::import("1_harmonization/cwt/cceb_cwt.xlsx") %>%
  drop_na(target_var) %>%
  split(., .$file_name)

# recoding based on the cross-walk table

# put the cross-walk tables in the same order as the data list and
# check if names in the data list and the cross-walk list match
cwt_cceb <- cwt_cceb[names(cceb_small)]
names(cwt_cceb)
names(cceb_all)
# a data file without a cross-walk shows up as an NA name; stop early
stopifnot(!anyNA(names(cwt_cceb)))

cceb_harm <- list()

for (j in seq_along(cwt_cceb)) {

  name_table <- names(cwt_cceb)[j]

  target_vars <- unique(cwt_cceb[[j]]$target_var)
  source_vars <- unique(cwt_cceb[[j]]$var_name)
  data_small <- cceb_small[[name_table]] %>% zap_labels()

  harmonized_vars <- vector("list", length(target_vars))

  for (i in seq_along(target_vars)) {

    target_var_input <- target_vars[i]

    # all cross-walk rows that define this target variable in this file
    cwt_rows <- cwt_cceb[[j]] %>%
      filter(target_var == target_var_input)

    source <- cwt_rows %>% pull(value_code)
    target <- cwt_rows %>% pull(target_value)
    # name of the source column (the first one listed for this target)
    source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

    # .data[[ ]] restricts the lookup to columns of data_small
    harmonized_vars[[i]] <- data_small %>%
      transmute(!!target_var_input := plyr::mapvalues(
        as.character(.data[[source_varname]]), source, target))
  }
  cceb_harm[[name_table]] <- cbind(data_small, harmonized_vars) %>%
    sjlabelled::remove_all_labels()
}
names(cceb_harm) <- names(cwt_cceb)

# apply final corrections
cceb_harm_df <- bind_rows(cceb_harm, .id = "file_name") %>%
  mutate_at(vars(starts_with("trust"), starts_with("t_educ"), t_ageedu, t_age,
                 t_female, t_weight_pst), as.numeric) %>%
  mutate(# survey year = first four characters of the wave label
         t_year = as.numeric(substr(t_wave, 1, 4)),
         # first non-missing value across all t_educ* columns
         t_educ = coalesce(!!! select(., starts_with("t_educ"))),
         # code 0 is replaced by age minus 6; values below 6 are set missing
         t_ageedu = ifelse(t_ageedu == 0, t_age - 6, t_ageedu),
         t_ageedu = ifelse(t_ageedu < 6, NA, t_ageedu)) %>%
  select(starts_with("t_"), starts_with("trust"), -starts_with("t_educ_"))

# save
saveRDS(cceb_harm_df, "1_harmonization/CCEB.Rds")





# CDCEE (Consolidation of Democracy in Central and Eastern Europe) -----------

### SPECIAL ###
# country-specific weight variables: 
# t_weight = coalesce(V633, V634, V636, V637, V638, V639, V640)
# no weights for many countries: t_weight = ifelse(is.na(t_weight), 1, t_weight))

# get variable names from the vars_all table
vars_cdcee <- vars_all %>% filter(study == "CDCEE")

# read in the survey data
# t_weight: first non-missing of the country-specific weight variables,
# set to 1 where no weight is available; t_caseid_n is the row position
cdcee <- haven::read_sav(paste0(data_path, "CDCEE/ZA4054.sav"),
                         user_na = TRUE) %>%
  zap_labels() %>%
  mutate(t_weight = coalesce(V633, V634, V636, V637, V638, V639, V640),
         t_weight = ifelse(is.na(t_weight), 1, t_weight),
         t_caseid_n = row_number()) %>%
  # vars_cdcee already holds CDCEE rows only
  select(all_of(vars_cdcee$src_var), t_weight, t_caseid_n)

# read in cross-walk table
cwt_cdcee <- rio::import("1_harmonization/cwt/cdcee_cwt.xlsx") %>%
  drop_na(target_var)

# recoding based on the cross-walk table
target_vars <- unique(cwt_cdcee$target_var)
source_vars <- unique(cwt_cdcee$var_name)
data_small <- cdcee %>% zap_labels()

harmonized_vars <- vector("list", length(target_vars))

# seq_along() keeps the loop from running when the cross-walk is empty
for (i in seq_along(target_vars)) {

  target_var_input <- target_vars[i]

  # all cross-walk rows that define this target variable
  cwt_rows <- cwt_cdcee %>%
    filter(target_var == target_var_input)

  source <- cwt_rows %>% pull(value_code)
  target <- cwt_rows %>% pull(target_value)
  # name of the source column (the first one listed for this target)
  source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

  # .data[[ ]] restricts the lookup to columns of data_small
  harmonized_vars[[i]] <- data_small %>%
    transmute(!!target_var_input := plyr::mapvalues(
      as.character(.data[[source_varname]]), source, target))
}

# read in table with survey years
cdcee_year <- rio::import("1_harmonization/harmonization/cdcee_wave_year.xlsx") %>%
  mutate(t_wave = as.character(t_wave))

# apply final corrections
cdcee_harm <- cbind(data_small, harmonized_vars) %>%
  # joined on all columns the two tables have in common
  left_join(cdcee_year) %>%
  # delete Krasnoyarsk sample and East-Germany sample in 1991
  filter(V3 != 16, !(V3 == 5 & V4 == 1)) %>%
  mutate(t_project = "CDCEE",
         t_caseid = V2,
         # fill in 2000 as survey year for Germany
         t_year = ifelse(t_wave == 2 & t_cntry == "DE", 2000, t_year),
         # assign age to age of completion of education for those "still at school"
         t_ageedu = ifelse(t_ageedu == 96, t_age, t_ageedu),
         # collapse all country codes starting with "DE" to "DE" where V4 == 2
         t_cntry = ifelse(substr(t_cntry, 1, 2) == "DE" & V4 == 2, "DE", t_cntry)) %>%
  select(starts_with("t_"), starts_with("trust")) %>%
  sjlabelled::remove_all_labels()

# save
saveRDS(cdcee_harm, "1_harmonization/CDCEE.Rds")




# EB (Eurobarometer) --------------------

# Table of EB files: the `wave` column is a space-separated description that
# is split into five pieces; the first two are dropped, and the brackets
# around the fieldwork months and the year are stripped.
eb_filenames <- rio::import("1_harmonization/harmonization/eb_filenames.xlsx")
eb_filenames <- separate(eb_filenames, wave, sep = " ",
                         into = c("file", "euro", "wave", "months", "year"))
eb_filenames <- eb_filenames %>%
  select(-file, -euro) %>%
  mutate(months = gsub("\\(", "", months),
         year = gsub("\\)", "", year))

# wave numbers of all EB files
eb_waves <- eb_filenames[["wave"]]

# long table with the country and weight variable of every wave
eb_cntry_weight <- eb_filenames %>%
  select(wave, t_cntry = cntry_var, t_weight = weight_var) %>%
  gather(target_var, src_var, 2:3) %>%
  mutate(study = "EB",
         study_wave = paste(study, wave, sep = "_"),
         source = "GESIS")

# folder with the EB data files and the files to read
path <- paste0(data_path, "EB")

filenames <- eb_filenames$file_name_full

# EB rows of the vars_all table
vars_eb <- filter(vars_all, study == "EB")


# Read every EB file and keep the needed variables plus technical variables.
eb_small <- list()

for (i in filenames) {

  # row of eb_filenames for this file, matched on the first six characters
  # of the file name
  file_info <- eb_filenames %>% filter(file_name == substr(i, 1, 6))

  wavenum <- as.character(pull(file_info, wave))
  # R expression (stored as text) that yields the weight for this file
  weight_recode <- pull(file_info, weight_recode)

  vars_wave <- vars_eb %>% filter(wave == wavenum)
  variables <- pull(vars_wave, src_var)
  caseid_var <- vars_wave %>%
    filter(target_var == "t_caseid") %>%
    pull(src_var)

  survey <- haven::read_sav(file.path(path, i), user_na = TRUE)
  survey <- janitor::clean_names(survey)

  # caseid_var and weight_recode are evaluated as code against the columns
  eb_small[[i]] <- survey %>%
    mutate(t_caseid = as.character(eval(parse(text = caseid_var))),
           t_caseid_n = row_number(),
           t_wave = wavenum,
           t_weight = eval(parse(text = weight_recode))) %>%
    select(all_of(variables), starts_with("t_"))
}


# read in cross-walk table: one cross-walk per data file (split on file_name)
cwt_eb <- rio::import("1_harmonization/cwt/eb_cwt.xlsx") %>%
  drop_na(target_var) %>%
  split(., .$file_name)

# recoding based on the cross-walk table

# name the data list by the first six characters of the file name and put
# the cross-walk tables in the same order; the printed comparison should be
# TRUE throughout
names(eb_small) <- substr(names(eb_small), 1, 6)

cwt_eb <- cwt_eb[names(eb_small)]
names(cwt_eb) == names(eb_small)
# a data file without a cross-walk shows up as an NA name; stop early
stopifnot(!anyNA(names(cwt_eb)))

eb_harm <- list()

for (j in seq_along(cwt_eb)) {

  name_table <- names(cwt_eb)[j]

  target_vars <- unique(cwt_eb[[j]]$target_var)
  source_vars <- unique(cwt_eb[[j]]$var_name)
  data_small <- eb_small[[name_table]] %>% zap_labels()

  harmonized_vars <- vector("list", length(target_vars))

  for (i in seq_along(target_vars)) {

    target_var_input <- target_vars[i]

    # all cross-walk rows that define this target variable in this file
    cwt_rows <- cwt_eb[[j]] %>%
      filter(target_var == target_var_input)

    source <- cwt_rows %>% pull(value_code)
    target <- cwt_rows %>% pull(target_value)
    # name of the source column (the first one listed for this target)
    source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

    # .data[[ ]] restricts the lookup to columns of data_small
    harmonized_vars[[i]] <- data_small %>%
      transmute(!!target_var_input := plyr::mapvalues(
        as.character(.data[[source_varname]]), source, target))
  }

  eb_harm[[name_table]] <- cbind(data_small, harmonized_vars)

}
names(eb_harm) <- names(cwt_eb)

# apply final corrections
eb_harm_df <- bind_rows(eb_harm, .id = "file_name") %>%
  # drop the "CY-TCC" sample
  filter(t_cntry != "CY-TCC") %>%
  mutate_at(vars(starts_with("trust")), as.numeric) %>%
  # add the file information; `year` becomes the survey year
  # NOTE(review): `year` is the text piece split from the wave description,
  # so t_year stays character here - confirm downstream code converts it
  left_join(eb_filenames, by = c("t_wave" = "wave")) %>%
  rename(t_year = year) %>%
  mutate(t_project = "EB",
         # keep the full code, reduce t_cntry to its first two characters
         t_cntry_subs = t_cntry,
         t_cntry = substr(t_cntry_subs, 1, 2)) %>%
  select(starts_with("t_"), starts_with("trust")) %>%
  # weight set to 1 for Norway in wave 54.1
  mutate(t_weight = ifelse(t_cntry == "NO" & t_wave == "54.1", 1, t_weight))


# save
saveRDS(eb_harm_df, "1_harmonization/EB.Rds")





# EQLS (European Quality of Life Survey) -------------

# get variable names from the vars_all table
vars_eqls <- vars_all %>% filter(study == "EQLS")

# read in the survey data (integrated trend file, read straight from the zip)
# and add the technical variables
eqls_1_4 <- haven::read_sav(paste0(data_path,
                                   "EQLS/eqls_integrated_trend_2003-2016.zip"),
                            user_na = TRUE) %>%
  # all_of() makes explicit that src_var is an external character vector
  select(all_of(vars_eqls$src_var)) %>%
  zap_labels() %>%
  mutate(# numeric country codes 1-36 -> two-letter country codes
         t_cntry = plyr::mapvalues(Y16_Country,
                                   c(1:36),
                                   c("AT", "BE", "BG", "CY", "CZ",
                                     "DE", "DK", "EE", "GR", "ES",
                                     "FI", "FR", "HR", "HU", "IE",
                                     "IT", "LT", "LU", "LV", "MT",
                                     "NL", "PL", "PT", "RO", "SE",
                                     "SI", "SK", "GB", "AL", "IS",
                                     "XK", "ME", "MK", "NO", "RS",
                                     "TR")),
         t_caseid = as.character(Y16_uniqueid),
         t_caseid_n = row_number(),
         t_weight = WCalib,
         t_weight_des = Design_Weight,
         t_wave = Wave,
         # wave number -> survey year
         t_year = plyr::mapvalues(Wave, c(1, 2, 3, 4),
                                  c(2003, 2007, 2011, 2016)),
         t_project = "EQLS")

# read in cross-walk table
cwt_eqls <- rio::import("1_harmonization/cwt/eqls_cwt.xlsx") %>%
  drop_na(target_var)

# recoding based on the cross-walk table
target_vars <- unique(cwt_eqls$target_var)
source_vars <- unique(cwt_eqls$var_name)
# keep the technical variables and the source variables to be recoded
data_small <- eqls_1_4 %>%
  zap_labels() %>%
  select(starts_with("t_"), all_of(source_vars))

harmonized_vars <- vector("list", length(target_vars))

# seq_along() keeps the loop from running when the cross-walk is empty
for (i in seq_along(target_vars)) {

  target_var_input <- target_vars[i]

  # all cross-walk rows that define this target variable
  cwt_rows <- cwt_eqls %>%
    filter(target_var == target_var_input)

  source <- cwt_rows %>% pull(value_code)
  target <- cwt_rows %>% pull(target_value)
  # name of the source column (the first one listed for this target)
  source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

  # .data[[ ]] restricts the lookup to columns of data_small
  harmonized_vars[[i]] <- data_small %>%
    transmute(!!target_var_input := plyr::mapvalues(
      as.character(.data[[source_varname]]), source, target))
}

# apply final corrections
eqls_harm <- cbind(data_small, harmonized_vars) %>%
  mutate_at(vars(starts_with("trust"), t_isced, t_educ, t_female, t_age),
            as.numeric) %>%
  select(starts_with("t_"), starts_with("trust")) %>%
  # the 2003 wave is not used
  filter(t_year != 2003) %>%
  # the design weight only accounts for the entire design in wave 4
  mutate(t_weight_des = ifelse(t_wave == 4, t_weight_des, 1))

# save
saveRDS(eqls_harm, "1_harmonization/EQLS.Rds")




# ESS (European Social Survey) -----------

# ESS rows of the vars_all table
vars_ess <- filter(vars_all, study == "ESS")

# zipped ESS data files found in the ESS folder
path <- paste0(data_path, "ESS")
temp <- list.files(path = path, pattern = ".zip$")

f <- file.path(path, temp)

variables <- pull(vars_ess, src_var)
caseid_var <- vars_ess %>%
  filter(target_var == "t_caseid") %>%
  pull(src_var)

# Reads one zipped ESS file and keeps the listed variables that exist in it,
# every variable starting with "inwy", and the two education-level variables
# (one_of() warns about the one that is absent). Labels are removed and the
# row position is stored as t_caseid_n.
read_ess_round <- function(zip_file) {
  haven::read_sav(zip_file, user_na = TRUE, encoding = "UTF-8") %>%
    select(any_of(variables),
           starts_with("inwy"),
           one_of("edulvla", "edulvlb")) %>%
    remove_labels() %>%
    mutate(t_caseid_n = row_number())
}

# the first nine files are read
ess_small <- lapply(f[1:9], read_ess_round)
names(ess_small) <- c(1:9)

# stack all files into one data frame
ess_merge <- bind_rows(ess_small)
# saveRDS(ess_merge, "ess_merge.rds")
# ess_merge <- readRDS("ess_merge.rds")

# read in cross-walk table
cwt_ess <- rio::import("1_harmonization/cwt/ess_cwt.xlsx") %>%
  drop_na(target_var)

# recoding based on the cross-walk table
target_vars <- unique(cwt_ess$target_var)
source_vars <- unique(cwt_ess$var_name)
data_small <- ess_merge %>% zap_labels()

harmonized_vars <- vector("list", length(target_vars))

# seq_along() keeps the loop from running when the cross-walk is empty
for (i in seq_along(target_vars)) {

  target_var_input <- target_vars[i]

  # all cross-walk rows that define this target variable
  cwt_rows <- cwt_ess %>%
    filter(target_var == target_var_input)

  source <- cwt_rows %>% pull(value_code)
  target <- cwt_rows %>% pull(target_value)
  # name of the source column (the first one listed for this target)
  source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

  # .data[[ ]] restricts the lookup to columns of data_small
  harmonized_vars[[i]] <- data_small %>%
    transmute(!!target_var_input := plyr::mapvalues(
      as.character(.data[[source_varname]]), source, target))
}

# apply final corrections
ess_harm <- cbind(data_small, harmonized_vars) %>%
  mutate_at(vars(starts_with("trust"), starts_with("t_educ"),
                 starts_with("t_year"), t_age, t_female, t_polint,
                 t_satdem, t_satlife, t_lrscale), as.numeric) %>%
  mutate(t_project = "ESS",
         t_caseid = idno,
         # first non-missing of the alternative year / education variables
         t_year = coalesce(t_year, t_year2, t_year3),
         t_educ = coalesce(t_educ, t_educ2, t_educ3),
         # fill in missing year for Estonia in round 5
         t_year = ifelse(t_cntry == "EE" & t_wave == 5, 2011, t_year),
         # 9999 is treated as a missing year
         t_year = ifelse(t_year == 9999, NA, t_year),
         t_weight_des = dweight,
         # post-stratification weight set to 1 in round 9
         t_weight_pst = ifelse(t_wave == 9, 1, pspwght)) %>%
  # one survey year per country-round: the rounded mean interview year
  group_by(t_cntry, t_wave) %>%
  mutate(t_year = round(mean(t_year, na.rm = TRUE), 0)) %>%
  ungroup() %>%
  select(starts_with("t_"), starts_with("trust"),
         -t_year2, -t_year3, -t_educ2, -t_educ3) %>%
  sjlabelled::remove_all_labels()

# save
saveRDS(ess_harm, "1_harmonization/ESS.Rds")





# EVS & WVS (European Values Study and World Values Survey) --------------

### SPECIAL ###
# WVS/3/NO: correct age-education, which really is schooling years, 
# and needs 6 added to give the age at completing education


# EVS and WVS rows of the vars_all table
vars_evs_wvs <- filter(vars_all, study %in% c("EVS", "WVS"))

# source variables listed for one study_wave label
src_vars_for <- function(study_wave_label) {
  vars_evs_wvs$src_var[vars_evs_wvs$study_wave == study_wave_label]
}

# read in the survey data: keep the listed variables and add the row
# position as t_caseid_n
evs_5_file <- paste0(data_path, "EVS/ZA7500_v3-0-0.sav.zip")
evs_5 <- read_sav(evs_5_file, user_na = TRUE) %>%
  select(all_of(src_vars_for("EVS_5"))) %>%
  mutate(t_caseid_n = row_number())

evs_1_4_file <- paste0(data_path, "EVS/ZA4804_v3-0-0.sav")
evs_1_4 <- read_sav(evs_1_4_file, encoding = "latin1", user_na = TRUE) %>%
  select(all_of(src_vars_for("EVS_1_4"))) %>%
  mutate(t_caseid_n = row_number())

wvs_1_6_file <- paste0(
  data_path,
  "WVS/F00008388-WVS_Longitudinal_1981_2016_spss_v20180912.zip"
)
wvs_1_6 <- read_sav(wvs_1_6_file, user_na = TRUE) %>%
  select(all_of(src_vars_for("WVS_1_6"))) %>%
  mutate(t_caseid_n = row_number())

# cross-walk table, one element per value of its `file` column
cwt_evs_wvs <- rio::import("1_harmonization/cwt/evs_wvs_cwt.xlsx") %>%
  drop_na(target_var) %>%
  split(., .$file)

# named list of the three data sets
evs_wvs <- list(evs_1_4 = evs_1_4, evs_5 = evs_5, wvs_1_6 = wvs_1_6)


# recoding based on the cross-walk table
# (the data set for each cross-walk is looked up by name in evs_wvs)
evs_wvs_harm <- list()

for (j in seq_along(cwt_evs_wvs)) {

  name_table <- names(cwt_evs_wvs)[j]

  target_vars <- unique(cwt_evs_wvs[[j]]$target_var)
  source_vars <- unique(cwt_evs_wvs[[j]]$var_name)
  data_small <- evs_wvs[[name_table]] %>% zap_labels()

  harmonized_vars <- vector("list", length(target_vars))

  for (i in seq_along(target_vars)) {

    target_var_input <- target_vars[i]

    # all cross-walk rows that define this target variable in this file
    cwt_rows <- cwt_evs_wvs[[j]] %>%
      filter(target_var == target_var_input)

    source <- cwt_rows %>% pull(value_code)
    target <- cwt_rows %>% pull(target_value)
    # name of the source column (the first one listed for this target)
    source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

    # .data[[ ]] restricts the lookup to columns of data_small
    harmonized_vars[[i]] <- data_small %>%
      transmute(!!target_var_input := plyr::mapvalues(
        as.character(.data[[source_varname]]), source, target))
  }
  evs_wvs_harm[[name_table]] <- cbind(data_small, harmonized_vars)
}
names(evs_wvs_harm) <- names(cwt_evs_wvs)

# apply final corrections
# t_project is the first three characters of the list name ("evs" / "wvs").
# FIX: the project is compared case-insensitively below. Before, the India
# corrections tested for "WVS" and therefore never applied, while the Norway
# correction tested for "wvs".
evs_wvs_harm_df <- bind_rows(evs_wvs_harm, .id = "file") %>%
  mutate_at(vars(starts_with("trust"), t_age, t_female, t_ageedu, t_educ,
                 t_satlife, t_satdem, t_polint, t_lrscale), as.numeric) %>%
  mutate(t_caseid = coalesce(S006, caseno),
         t_project = substr(file, 1, 3),
         # a missing wave is set to "5"
         t_wave = ifelse(is.na(t_wave), "5", t_wave),
         # correct lrscale in WVS/4-5/India, which de facto was asked on a 5-point scale
         t_lrscale = ifelse(t_cntry == "IN" & toupper(t_project) == "WVS" &
                              t_wave == 5,
                            plyr::mapvalues(t_lrscale,
                                            c(1, 3, 5, 7, 10),
                                            c(1, 2, 3, 4, 5)),
                            t_lrscale),
         t_lrscale = ifelse(t_cntry == "IN" & toupper(t_project) == "WVS" &
                              t_wave == 4,
                            plyr::mapvalues(t_lrscale,
                                            c(1, 2, 3, 5, 8, 9, 10),
                                            c(1, NA, 2, 3, 4, NA, 5)),
                            t_lrscale)) %>%
  select(starts_with("t_"), starts_with("trust")) %>%
  # correct age-education in Norway, which really is schooling years
  mutate(t_ageedu = ifelse(toupper(t_project) == "WVS" & t_wave == 3 &
                             t_cntry == "NO",
                           t_ageedu + 6, t_ageedu)) %>%
  # remove t_polint for WVS/2/CZ-SK, which only has two realized values - likely an error
  # NOTE(review): the condition does not test the project, so wave 2 rows for
  # CZ/SK from any file are set to NA - confirm this is intended
  mutate(t_polint = ifelse(t_wave == 2 & t_cntry %in% c("SK", "CZ"), NA, t_polint)) %>%
  sjlabelled::remove_all_labels()

# save
saveRDS(evs_wvs_harm_df, "1_harmonization/EVS_WVS.Rds")





# IntUne (Integrated and United) ------------

### SPECIAL ###
# "still at school" has a separate code. Needs converting into 
# (current) schooling years and education level based on age.

# read in the survey data: every .sav file in the IntUne folder
path <- paste0(data_path, "IntUne")
temp <- list.files(path = path, pattern = ".sav$")

f <- file.path(path, temp)

intune_all <- lapply(f, haven::read_sav, user_na = TRUE)

# wave labels, assigned in the order in which the files were listed
intune_waves <- c("2009", "2007")
names(intune_all) <- intune_waves

# add the wave label as first column `Source` (also used as its label)
for (wave_label in intune_waves) {
  tagged <- intune_all[[wave_label]]
  tagged$Source <- wave_label
  var_label(tagged$Source) <- wave_label
  intune_all[[wave_label]] <- dplyr::select(tagged, Source, everything())
}

# get variable names from the vars_all table
vars_intune <- vars_all %>% filter(study == "IntUne")

# keep only the needed variables and add the technical variables
# (i is the wave label used as list name)
intune_small <- list()
for (i in names(intune_all)) {

  # rows of the variable table for this wave
  vars_wave <- vars_intune %>% filter(wave == i)

  variables <- vars_wave %>% pull(src_var)

  caseid_var <- vars_wave %>%
    filter(target_var == "t_caseid") %>%
    pull(src_var)

  weight_var <- vars_wave %>%
    filter(target_var %in% c("t_weight_pst", "t_weight_despst")) %>%
    pull(src_var)

  # column names are cleaned first, so src_var has to be given in the
  # cleaned form
  intune_small[[i]] <- intune_all[[i]] %>%
    janitor::clean_names() %>%
    select(all_of(variables)) %>%
    mutate(t_caseid = eval(parse(text = caseid_var)),
           t_caseid_n = row_number(),
           t_project = "IntUne",
           t_wave = i,
           # FIX: take the values of the weight column; before, the column
           # *name* was stored as a constant string
           t_weight_pst = .data[[weight_var]])
}

# read in cross-walk table: one cross-walk per wave
cwt_intune <- rio::import("1_harmonization/cwt/intune_cwt.xlsx") %>%
  drop_na(target_var) %>%
  split(., .$wave)

# recoding based on the cross-walk table
# (the data set for each cross-walk is looked up by wave label)
intune_harm <- list()
for (j in seq_along(cwt_intune)) {

  wave <- names(cwt_intune)[j]

  target_vars <- unique(cwt_intune[[j]]$target_var)
  source_vars <- unique(cwt_intune[[j]]$var_name)
  data_small <- intune_small[[wave]] %>% zap_labels()

  harmonized_vars <- vector("list", length(target_vars))

  for (i in seq_along(target_vars)) {

    target_var_input <- target_vars[i]

    # all cross-walk rows that define this target variable in this wave
    cwt_rows <- cwt_intune[[j]] %>%
      filter(target_var == target_var_input)

    source <- cwt_rows %>% pull(value_code)
    target <- cwt_rows %>% pull(target_value)
    # name of the source column (the first one listed for this target)
    source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

    # .data[[ ]] restricts the lookup to columns of data_small
    harmonized_vars[[i]] <- data_small %>%
      transmute(!!target_var_input := plyr::mapvalues(
        as.character(.data[[source_varname]]), source, target))
  }
  intune_harm[[wave]] <- cbind(data_small, harmonized_vars)
}

# apply final corrections
intune_harm_df <- bind_rows(intune_harm, .id = "wave") %>%
  mutate_at(vars(starts_with("trust"), t_yob, t_educ, t_lrscale,
                 t_satdem, t_polint), as.numeric) %>%
  mutate(# the wave label is the survey year; age = year - year of birth
         t_year = as.numeric(t_wave),
         t_age = t_year - t_yob,
         # code 9 ("still at school"): education level assigned by age
         t_educ = ifelse(t_educ == 9 & t_age <= 17, 1, t_educ),
         t_educ = ifelse(t_educ == 9 & t_age > 17 & t_age <= 24, 2, t_educ),
         t_educ = ifelse(t_educ == 9 & t_age > 24, 3, t_educ)) %>%
  select(starts_with("t_"), starts_with("trust")) %>%
  sjlabelled::remove_all_labels()

# save
saveRDS(intune_harm_df, "1_harmonization/IntUne.Rds")





# ISSP (International Social Survey Programme) ------------

### SPECIAL ###
# ISSP 1991: country-specific education variables in a single variable: 
#  v99 (needs splitting into separate variables for cwt)
# ISSP 1991, SI: trust scale has 4 points not 5 points (cf. codebook)
# ISSP 1998: country-specific education variables: between V317 and V331
# ISSP 2008: country-specific education variables: AT_DEGR and ZA_DEGR

# table mapping countries and waves to survey years (wave stored as number)
issp_year <- rio::import("1_harmonization/harmonization/issp_wave_year.xlsx")
issp_year <- mutate(issp_year, t_wave = as.numeric(t_wave))

# read in the survey data: one file per module year, user-defined missing
# codes kept as values
issp_files <- c("1991" = "ISSP/ZA2150_2006-04-24.sav",
                "1998" = "ISSP/ZA3190_2006-04-27.sav",
                "2008" = "ISSP/ZA4950_v2-2-0.sav")
issp <- lapply(issp_files, function(rel_path) {
  haven::read_sav(paste0(data_path, rel_path), user_na = TRUE)
})

# ISSP rows of the vars_all table
vars_issp <- filter(vars_all, study == "ISSP")

# first and last of the country-specific education variables per module;
# the variables in between are selected as a range
educ_vars <- list("1998" = c("V317", "V331"),
                  "2008" = c("AT_DEGR", "ZA_DEGR"),
                  "2018" = c("AT_DEGR", "US_DEGR"))

# Keep only the needed variables per module and add technical variables.
# Source variable names are lower-cased because the column names are cleaned
# with janitor::clean_names() before selecting.
issp_small <- issp

for (i in names(issp_small)) {

  # rows of the variable table for this module
  vars_wave <- vars_issp %>% filter(wave == i)

  variables <- tolower(pull(vars_wave, src_var))

  caseid_var <- vars_wave %>%
    filter(target_var == "t_caseid") %>%
    pull(src_var) %>%
    tolower()

  weight_var <- vars_wave %>%
    filter(target_var == "t_weight_despst") %>%
    pull(src_var) %>%
    tolower()

  educ_var <- vars_wave %>%
    filter(target_var == "t_educ") %>%
    pull(src_var) %>%
    tolower()

  # modules with country-specific education variables: add the whole range
  # between the first and the last of them
  if (i %in% c("1998", "2008", "2018")) {
    educ_vars_list <- issp[[i]] %>%
      select(educ_vars[[i]][1]:educ_vars[[i]][2]) %>%
      names()
    variables <- tolower(c(variables, educ_vars_list))
  }

  cleaned <- janitor::clean_names(issp_small[[i]])

  # the case id and weight variable names are evaluated against the columns
  issp_small[[i]] <- cleaned %>%
    select(all_of(variables)) %>%
    mutate(t_caseid = as.character(eval(parse(text = caseid_var))),
           t_caseid_n = row_number(),
           t_project = "ISSP",
           t_wave = i,
           t_weight_pst = eval(parse(text = weight_var)))
}

# ISSP 1991: the education variables v99 and v98 use country-specific codes.
# Each is copied into one variable per country (or country group), selected
# by the country code v3 and left NA for all other countries, so that the
# cross-walk can recode them separately.
issp_small[["1991"]] <- issp_small[["1991"]] %>%
  mutate(
    # v99 by country
    s_educ_de = ifelse(v3 %in% c(1,2), v99, NA),
    s_educ_gb = ifelse(v3 %in% c(3,4), v99, NA),
    s_educ_us = ifelse(v3 == 5, v99, NA),
    s_educ_hu = ifelse(v3 == 6, v99, NA),
    s_educ_nl = ifelse(v3 == 7, v99, NA),
    s_educ_it = ifelse(v3 == 8, v99, NA),
    s_educ_ieplau = ifelse(v3 %in% c(9,13,18), v99, NA),
    s_educ_no = ifelse(v3 == 10, v99, NA),
    s_educ_at = ifelse(v3 == 11, v99, NA),
    s_educ_si = ifelse(v3 == 12, v99, NA),
    s_educ_il = ifelse(v3 == 14, v99, NA),
    s_educ_ph = ifelse(v3 == 15, v99, NA),
    s_educ_nz = ifelse(v3 == 16, v99, NA),
    s_educ_ru = ifelse(v3 == 17, v99, NA),
    # v98 by country
    s_yrsedu_de = ifelse(v3 %in% c(1,2), v98, NA),
    s_yrsedu_gb = ifelse(v3 %in% c(3,4), v98, NA),
    s_yrsedu_us = ifelse(v3 == 5, v98, NA),
    s_yrsedu_hu = ifelse(v3 == 6, v98, NA),
    s_yrsedu_nl = ifelse(v3 == 7, v98, NA),
    s_yrsedu_it = ifelse(v3 == 8, v98, NA),
    s_yrsedu_ie = ifelse(v3 == 9, v98, NA),
    s_yrsedu_pl = ifelse(v3 == 13, v98, NA),
    s_yrsedu_au = ifelse(v3 == 18, v98, NA),
    s_yrsedu_no = ifelse(v3 == 10, v98, NA),
    s_yrsedu_at = ifelse(v3 == 11, v98, NA),
    s_yrsedu_si = ifelse(v3 == 12, v98, NA),
    s_yrsedu_il = ifelse(v3 == 14, v98, NA),
    s_yrsedu_ph = ifelse(v3 == 15, v98, NA),
    s_yrsedu_nz = ifelse(v3 == 16, v98, NA),
    s_yrsedu_ru = ifelse(v3 == 17, v98, NA)
  )

# read in cross-walk table: one cross-walk per wave
cwt_issp <- rio::import("1_harmonization/cwt/issp_cwt.xlsx") %>%
  drop_na(target_var) %>%
  split(., .$wave)

# recoding based on the cross-walk table
# cross-walks and data sets are paired by position, so the printed
# comparison of their names should be TRUE throughout
names(cwt_issp) == names(issp_small)

issp_harm <- list()
for (j in seq_along(cwt_issp)) {

  target_vars <- unique(cwt_issp[[j]]$target_var)
  source_vars <- unique(cwt_issp[[j]]$var_name)
  data_small <- issp_small[[j]] %>% zap_labels()

  harmonized_vars <- vector("list", length(target_vars))

  for (i in seq_along(target_vars)) {

    target_var_input <- target_vars[i]

    # all cross-walk rows that define this target variable in this wave
    cwt_rows <- cwt_issp[[j]] %>%
      filter(target_var == target_var_input)

    source <- cwt_rows %>% pull(value_code)
    target <- cwt_rows %>% pull(target_value)
    # name of the source column (the first one listed for this target)
    source_varname <- cwt_rows %>% pull(var_name) %>% .[1]

    # .data[[ ]] restricts the lookup to columns of data_small
    harmonized_vars[[i]] <- data_small %>%
      transmute(!!target_var_input := plyr::mapvalues(
        as.character(.data[[source_varname]]), source, target))
  }
  issp_harm[[j]] <- cbind(data_small, harmonized_vars)
}
names(issp_harm) <- names(cwt_issp)

# apply final corrections
issp_harm_df <- bind_rows(issp_harm, .id = "wave") %>%
  mutate_at(vars(starts_with("trust"), t_wave, t_age,
                 starts_with("t_educ"), starts_with("t_yrsedu")), as.numeric) %>%
  # add survey years; joined on all columns the two tables have in common
  left_join(issp_year) %>%
  mutate(# 0 years of education in GB-NIR/2008 is set to missing
         t_yrsedu = ifelse(t_cntry == "GB-NIR" & t_wave == 2008 & t_yrsedu == 0, NA, t_yrsedu),
         # first non-missing value: country-specific columns first for
         # t_educ, the common column first for t_yrsedu
         t_educ = coalesce(!!! select(., starts_with("t_educ_"), t_educ)),
         t_yrsedu = coalesce(!!! select(., t_yrsedu, starts_with("t_yrsedu_"))),
         # codes 95 and 96 are replaced by age minus 6
         t_yrsedu = ifelse(t_yrsedu %in% c(95, 96), t_age - 6, t_yrsedu),
         # years of education set to missing for RU/1991
         t_yrsedu = ifelse(t_cntry == "RU" & t_wave == "1991", NA, t_yrsedu),
         # fix trust coding in SI/1991
         trust_parl = ifelse(t_cntry == "SI" & t_wave == "1991",
                             plyr::mapvalues(trust_parl, c(1,3,4,5), c(1,2,3,4)),
                             trust_parl),
         trust_jus = ifelse(t_cntry == "SI" & t_wave == "1991",
                            plyr::mapvalues(trust_jus, c(1,3,4,5), c(1,2,3,4)),
                            trust_jus),
         trust_rel = ifelse(t_cntry == "SI" & t_wave == "1991",
                            plyr::mapvalues(trust_rel, c(1,3,4,5), c(1,2,3,4)),
                            trust_rel)) %>%
  select(starts_with("t_"), starts_with("trust"),
         -starts_with("t_educ_"), -starts_with("t_yrsedu_")) %>%
  sjlabelled::remove_all_labels()

# save
saveRDS(issp_harm_df, "1_harmonization/ISSP.Rds")





# LITS (Life in Transition Survey) ------------

# read in the survey data: one Stata file per wave, keyed by survey year;
# convert.factors = FALSE keeps the stored codes instead of factor labels

lits_files <- c("2006" = "LITS/LITS 2006 data.dta",
                "2010" = "LITS/lits2.dta",
                "2016" = "LITS/LiTS III.dta")

lits <- lapply(lits_files, function(file_name) {
  read.dta13(paste0(data_path, file_name), convert.factors = FALSE)
})

# get variable names from the vars_all table
vars_lits <- vars_all %>% filter(study == "LITS")

# select only the needed variables
# For every wave: keep the listed source variables and add the technical
# variables (case id, row number, project, wave, weight, country).
lits_small <- lits
for (yr in names(lits_small)) {
  
  # rows of the variable list that belong to this wave
  vars_wave <- vars_lits %>% filter(year == yr)
  keep_vars <- vars_wave %>% pull(src_var)
  
  # name of the source variable feeding a given target variable in this wave
  src_for <- function(wanted_target) {
    vars_wave %>% 
      filter(target_var == wanted_target) %>% 
      pull(src_var)
  }
  
  src_caseid <- src_for("t_caseid")
  src_weight <- src_for("t_weight_despst")
  src_cntry <- src_for("t_cntry")
  
  # NOTE(review): the source columns are addressed by evaluating their names
  # as code; this is kept as is because a wave without a listed variable then
  # simply gets no such column
  lits_small[[yr]] <- lits_small[[yr]] %>% 
    select(all_of(keep_vars)) %>%
    mutate(t_caseid = as.character(eval(parse(text = src_caseid))),
           t_caseid_n = row_number(),
           t_project = "LITS",
           t_wave = yr,
           t_weight_pst = eval(parse(text = src_weight)),
           t_cntry = eval(parse(text = src_cntry)))
}

# recodes
# Every wave has its own question names and missing-value codes, so each is
# recoded separately; afterwards only the t_* and trust* columns are kept.

# 2006 wave: trust items keep codes 1-5, code 6 becomes missing
from_2006 <- c(1:6)
to_2006 <- c(1:5,NA)

lits_small[["2006"]] <- lits_small[["2006"]] %>%
  mutate(t_female = plyr::mapvalues(genderB, c(1, 0), c(0, 1)),
         trust_army = plyr::mapvalues(q303_6, from_2006, to_2006),
         trust_gov = plyr::mapvalues(q303_2, from_2006, to_2006),
         trust_jus = plyr::mapvalues(q303_4, from_2006, to_2006),
         trust_parl = plyr::mapvalues(q303_3, from_2006, to_2006),
         trust_police = plyr::mapvalues(q303_7, from_2006, to_2006),
         trust_polpart = plyr::mapvalues(q303_5, from_2006, to_2006),
         trust_rel = plyr::mapvalues(q303_12, from_2006, to_2006),
         t_age = ageB,
         # q502 is a calendar year; subtracting the birth year (2006 - age)
         # gives the age at that year, values below 6 are set to missing
         t_ageedu_year = q502,
         t_ageedu = t_ageedu_year - (2006 - t_age),
         t_ageedu = ifelse(t_ageedu < 6, NA, t_ageedu),
         t_educ = plyr::mapvalues(q501, c(1:6), c(1,1,2,2,3,3)),
         t_satlife = plyr::mapvalues(q301_7, c(1:7), c(1,2,3,4,5,NA,NA))) %>%
  select(starts_with("t_"), starts_with("trust"))

# 2010 wave: negative codes (-98, -97, -1) become missing
from_2010 <- c(-98, -97, -1, 1:5)
to_2010 <- c(NA, NA, NA, 1:5)

lits_small[["2010"]] <- lits_small[["2010"]] %>%
  mutate(t_female = plyr::mapvalues(respondentgender, c(-1,1,2), c(NA, 0, 1)),
         trust_army = plyr::mapvalues(q303h, from_2010, to_2010),
         trust_gov = plyr::mapvalues(q303b, from_2010, to_2010),
         trust_jus = plyr::mapvalues(q303f, from_2010, to_2010),
         trust_parl = plyr::mapvalues(q303e, from_2010, to_2010),
         trust_police = plyr::mapvalues(q303i, from_2010, to_2010),
         trust_polpart = plyr::mapvalues(q303g, from_2010, to_2010),
         trust_rel = plyr::mapvalues(q303n, from_2010, to_2010),
         # ages of 17 and below are set to missing
         t_age = ifelse(respondentage > 17, respondentage, NA),
         # only values above 1900 are accepted as a calendar year
         t_ageedu_year = ifelse(q516 > 1900, q516, NA),
         # NOTE(review): the birth year is derived with 2011 although this
         # wave is keyed as 2010 - confirm the reference year
         t_ageedu = t_ageedu_year - (2011 - t_age),
         t_ageedu = ifelse(t_ageedu < 6, NA, t_ageedu),
         t_educ = plyr::mapvalues(q515, c(-1,1:7), c(NA,1,1,1,2,2,3,3)),
         t_satlife = plyr::mapvalues(q722, c(-1, 1:10), c(NA, 1:10))) %>%
  select(starts_with("t_"), starts_with("trust"))

# 2016 wave: codes -98 and -97 become missing
from_2016 <- c(-98, -97, 1:5)
to_2016 <- c(NA, NA, 1:5)

lits_small[["2016"]] <- lits_small[["2016"]] %>%
  mutate(t_female = plyr::mapvalues(gender_pr, c(1,2), c(0, 1)),
         trust_army = plyr::mapvalues(q404h, from_2016, to_2016),
         trust_gov = plyr::mapvalues(q404b, from_2016, to_2016),
         trust_jus = plyr::mapvalues(q404f, from_2016, to_2016),
         trust_parl = plyr::mapvalues(q404e, from_2016, to_2016),
         trust_police = plyr::mapvalues(q404i, from_2016, to_2016),
         trust_polpart = plyr::mapvalues(q404g, from_2016, to_2016),
         trust_rel = plyr::mapvalues(q404n, from_2016, to_2016),
         t_age = age_pr,
         t_educ = plyr::mapvalues(q109_1, c(1:8), c(1,1,1,2,2,2,3,3)),
         t_satlife = plyr::mapvalues(q401e, from_2016, to_2016)) %>%
  select(starts_with("t_"), starts_with("trust"))

# apply final corrections
# stack the waves; the list names (survey years) become the t_year column
lits_stacked <- bind_rows(lits_small, .id = "t_year")

# convert country names to ISO 2-letter codes; every name that cannot be
# converted (NA) is assigned "XK"
lits_iso2 <- countrycode::countrycode(lits_stacked[["t_cntry"]],
                                      "country.name", "iso2c")
lits_stacked[["t_cntry"]] <- ifelse(is.na(lits_iso2), "XK", lits_iso2)

lits_harm <- sjlabelled::remove_all_labels(lits_stacked)

# save
saveRDS(lits_harm, "1_harmonization/LITS.Rds")




# NBB (New Baltic Barometer) separate files ----------

var_list <- vars_all %>% filter(study == "NBB")

# read in data
# keep the listed source variables plus the technical t_* columns
nbb <- haven::read_sav(paste0(data_path, "NBB/mbltrv.sav"), 
                       user_na = TRUE) %>%
  mutate(t_caseid_n = row_number()) %>%
  select(all_of(unique(var_list$src_var)), starts_with("t_"))

# read in cross-walk table
# rows without a target variable are dropped, as for the other projects
cwt_nbb <- rio::import("1_harmonization/cwt/nbb_cwt.xlsx") %>%
  drop_na(target_var)

# recoding based on the cross-walk table

target_vars <- unique(cwt_nbb[["target_var"]])
data_small <- nbb %>% zap_labels()

harmonized_vars <- vector("list", length(target_vars))

for (i in seq_along(target_vars)) {
  
  target_var_input <- target_vars[i]
  
  # all cross-walk rows that define this target variable
  cwt_rows <- cwt_nbb %>% 
    filter(target_var == target_var_input)
  
  # source codes and the harmonized values they are mapped to
  codes_from <- cwt_rows[["value_code"]]
  codes_to <- cwt_rows[["target_value"]]
  
  # the first source variable listed for the target is the one recoded
  source_varname <- cwt_rows[["var_name"]][1]
  
  # one-column data frame named after the target variable; source values
  # are compared as character strings
  harmonized_vars[[i]] <- data_small %>%
    transmute(!!target_var_input := plyr::mapvalues(
      as.character(.data[[source_varname]]), codes_from, codes_to))
}

# apply final corrections
nbb_harm <- cbind(data_small, harmonized_vars) %>%
  mutate(t_caseid = t_caseid_n,
         t_weight = w,
         t_year = year,
         t_project = "NBB",
         # each trust item has a second harmonized column (suffix 2);
         # use it wherever the first one is missing
         trust_army = coalesce(trust_army, trust_army2),
         trust_parl = coalesce(trust_parl, trust_parl2),
         trust_rel = coalesce(trust_rel, trust_rel2),
         trust_police = coalesce(trust_police, trust_police2),
         trust_polpart = coalesce(trust_polpart, trust_polpart2),
         trust_jus = coalesce(trust_jus, trust_jus2)) %>%
  mutate_at(vars(starts_with("trust"), t_age, t_female, t_educ, t_ageedu,
                 t_polint, t_lrscale, t_satdem, t_satlife), as.numeric) %>%
  # keep the t_* columns and the combined trust items (not the "2" versions)
  select(starts_with("t_"), trust_army, trust_police, trust_parl, trust_rel, trust_polpart,
         trust_media, trust_jus, trust_gov) %>%
  sjlabelled::remove_all_labels()

# save
saveRDS(nbb_harm, "1_harmonization/NBB.Rds")




# NEB (New Europe Barometer) separate files ----------

# data file used for each NEB wave
neb_filenames <- data.frame(file_name_full = c("multnd92.sav", "multnd93.sav", 
                                               "multnd95.sav", "multnd98.sav", 
                                               "multne01.por", "multne04.por"),
                            wave = c("2", "3", "4", "5", "6", "7")) %>%
  mutate_all(as.character)


# path to folder with all NEB data files
path <- paste0(data_path, "NEB/single waves")
list.files(path = path, pattern = "\\.(por|sav)$")

# the dot is escaped so that only real file extensions match
temp_sav <- list.files(path = path, pattern = "\\.sav$")
temp_por <- list.files(path = path, pattern = "\\.por$")

fsav <- file.path(path, temp_sav)
fpor <- file.path(path, temp_por)

neb_sav <- lapply(fsav, haven::read_sav, user_na = TRUE)
names(neb_sav) <- temp_sav

neb_por <- lapply(fpor, haven::read_por, user_na = TRUE)
names(neb_por) <- temp_por

neb_all <- append(neb_sav, neb_por)

# fail early, naming the files, if a wave's data file was not found
neb_missing <- setdiff(neb_filenames$file_name_full, names(neb_all))
if (length(neb_missing) > 0) {
  stop("NEB data files not found in '", path, "': ",
       paste(neb_missing, collapse = ", "), call. = FALSE)
}

# keep only the listed files, in wave order, and name them by wave number
neb_all <- neb_all[unique(neb_filenames$file_name_full)]
names(neb_all) <- unique(neb_filenames$wave)


# country code mapping by wave (rows without a target value are dropped)
cntry_map_table <- rio::import("1_harmonization/harmonization/NEB_cntry_codes.xlsx") %>%
  drop_na(target_value) %>%
  select(wave, value_code, target_value)


var_list <- vars_all %>% filter(study == "NEB")

neb_small <- neb_all

# For every wave: add case identifiers, drop rows with an empty country
# value, keep the listed source variables and strip the SPSS user-missing
# attributes.
for (i in seq_along(neb_small)) {
  
  wavenum <- names(neb_small)[i]
  
  # rows of the variable list that belong to this wave
  vars_wave <- var_list %>% filter(wave == wavenum)
  
  variables <- vars_wave %>% pull(src_var)
  
  # name of the source column holding the country
  var_cntry <- vars_wave %>% 
    filter(target_var == "t_cntry") %>% 
    pull(src_var)
  
  neb_small[[i]] <- neb_small[[i]] %>% 
    # row numbers are assigned before filtering, so they refer to the
    # position in the original file
    mutate(t_caseid_n = row_number(),
           t_caseid = t_caseid_n) %>%
    filter(.data[[var_cntry]] != "") %>%
    select(all_of(variables), t_caseid, t_caseid_n) %>%
    remove_attributes(c("na_range", "na_values"))
}

# read in cross-walk table: drop rows without a target variable and split
# into one table per wave
cwt_all_df1 <- rio::import("1_harmonization/cwt/neb_cwt.xlsx") %>%
  drop_na(target_var) %>%
  split(., .$t_wave)

# The loop pairs cwt_all_df1[[j]] with neb_small[[j]] by position, so both
# lists must hold the same waves in the same order. Warn if the names
# disagree instead of only printing a logical vector.
if (!identical(names(cwt_all_df1), names(neb_small))) {
  warning("NEB: names of the cross-walk tables and of the wave data sets ",
          "differ; they are paired by position", call. = FALSE)
}

neb_harm <- vector("list", length(cwt_all_df1))

for (j in seq_along(cwt_all_df1)) {
  
  cwt_wave <- cwt_all_df1[[j]]
  target_vars <- unique(cwt_wave[["target_var"]])
  data_small <- neb_small[[j]] %>% zap_labels()
  
  harmonized_vars <- vector("list", length(target_vars))
  
  for (i in seq_along(target_vars)) {
    
    target_var_input <- target_vars[i]
    
    # all cross-walk rows that define this target variable
    cwt_rows <- cwt_wave %>% 
      filter(target_var == target_var_input)
    
    # source codes and the harmonized values they are mapped to
    codes_from <- cwt_rows[["value_code"]]
    codes_to <- cwt_rows[["target_value"]]
    
    # the first source variable listed for the target is the one recoded
    source_varname <- cwt_rows[["var_name"]][1]
    
    # one-column data frame named after the target variable; source values
    # are compared as character strings
    harmonized_vars[[i]] <- data_small %>%
      transmute(!!target_var_input := plyr::mapvalues(
        as.character(.data[[source_varname]]), codes_from, codes_to))
  }
  
  # source columns followed by the harmonized columns
  neb_harm[[j]] <- cbind(data_small, harmonized_vars)
  
}

names(neb_harm) <- names(cwt_all_df1)

# wave numbers and the survey year assigned to each of them
neb_wave_numbers <- c(2:7)
neb_wave_years <- c(1992, 1993, 1995, 1998, 2001, 2004)

# stack the waves (list names become t_wave), convert harmonized columns to
# numeric, keep harmonized columns only and add year and project
neb_harm_df <- neb_harm %>%
  bind_rows(.id = "t_wave") %>%
  mutate_at(vars(starts_with("trust"), t_wave, t_age, 
                 starts_with("t_educ"), starts_with("t_yrsedu"),
                 t_polint, t_satdem), as.numeric) %>%
  select(starts_with("t_"), starts_with("trust")) %>%
  # the year is looked up while t_wave is still numeric
  mutate(t_year = plyr::mapvalues(t_wave, neb_wave_numbers, neb_wave_years)) %>%
  mutate(t_wave = as.character(t_wave),
         t_project = "NEB")

saveRDS(neb_harm_df, "1_harmonization/NEB.Rds")




# ALL -------------------------


# read in recoded data from different survey projects ------------------------

path <- "1_harmonization"
# the dot is escaped so that only files with the extension ".Rds" match; the
# pattern is case-sensitive, so the final ".rds" file written at the end of
# this script is not picked up
temp <- list.files(path = path, pattern = "\\.Rds$")
# 12 files:
# "ASES.Rds" "CCEB.Rds" "CDCEE.Rds" "EB.Rds" "EQLS.Rds" "ESS.Rds"  
# "EVS_WVS.Rds" "IntUne.Rds" "ISSP.Rds" "LITS.Rds" "NBB.Rds" "NEB.Rds"  

f <- file.path(path, temp)

# one data frame per file, named by the file name up to its first dot
all_list <- lapply(f, readRDS)
names(all_list) <- sub("\\..*", "", temp)

# unify variable types
# so that the project files can be stacked: measurement columns become
# numeric, identifiers and wave become character
all_list <- lapply(all_list, function(project_data) {
  project_data %>%
    mutate_at(vars(starts_with("t_weight"), t_year, starts_with("t_age"),
                   t_female, starts_with("t_educ"), starts_with("trust"),
                   starts_with("t_yrsedu"), starts_with("t_sat"), 
                   any_of("t_lrscale")), as.numeric) %>%
    mutate_at(vars(t_caseid, t_caseid_n, t_wave), as.character)
})


# clean and recode ------------------------

# number of response categories of the trust items, by project
npoints_projects <- c("ASES", "CCEB", "CDCEE", "EB", "EQLS", "ESS", 
                      "EVS", "INTUNE", "ISSP", "LITS", "NBB", "NEB", "WVS")
npoints_values <- c(4, 2, 4, 2, 10, 11, 
                    4, 11, 5, 5, 7, 7, 4)

# Rescale a trust item with `npoints` categories to the common 0-10 scale:
# category k is placed at the midpoint of the k-th of `npoints` equal-width
# intervals on [0, 10]; items with 11 categories are only shifted by -1.
rescale_trust_11 <- function(x, npoints) {
  ifelse(npoints == 11, x - 1, 10/npoints/2 + (x-1)*10/npoints)
}

all <- all_list %>%
  bind_rows(.id = "src_file_name") %>%
  mutate(t_project = toupper(t_project),
         # create survey identifier
         survey = paste0(t_project, t_wave, t_cntry, t_year)) %>%
  group_by(survey) %>%
  mutate(# calculate mean of trust in parliament by survey
    mean_trust_parl = mean(trust_parl, na.rm = TRUE),
    # first available weight; missing weights are set to 1
    t_weight = coalesce(t_weight, t_weight_pst, t_weight_despst),
    t_weight = ifelse(is.na(t_weight), 1, t_weight),
    t_weight_des = ifelse(is.na(t_weight_des), 1, t_weight_des),
    t_age = round(t_age)) %>%
  # drop surveys without trust in parliament
  drop_na(mean_trust_parl) %>%
  ungroup() %>%
  # remove duplicated surveys from NEB/6&7 from EE, LT, and LV
  filter(!(t_cntry %in% c("EE", "LV", "LT") & t_project == "NEB" & t_wave %in% c("6", "7"))) %>%
  # identify the length of trust scales
  mutate(trust_npoints = as.numeric(plyr::mapvalues(t_project, 
                                                    npoints_projects, 
                                                    npoints_values)),
         # surveys whose scale length differs from their project's default
         trust_npoints = ifelse(t_project == "ISSP" & t_wave == "1991" & t_cntry == "SI",
                                4, trust_npoints),
         trust_npoints = ifelse(t_project == "NBB" & t_wave %in% c(1,3),
                                4, trust_npoints)) %>%
  # rescale trust variables to common 0-10 scale
  mutate(trust_parl_11 = rescale_trust_11(trust_parl, trust_npoints),
         trust_jus_11 = rescale_trust_11(trust_jus, trust_npoints),
         trust_polpart_11 = rescale_trust_11(trust_polpart, trust_npoints),
         thres = trust_npoints - 1)


### Weights for DE and GB ------------------------
# 
# # Germany population data
# # https://www.statistik-bw.de/VGRdL/tbls/tab.jsp?rev=RV2014&tbl=tab20&lang=de-DE
# 
# library(rvest)
# 
# url_de <- "https://www.statistik-bw.de/VGRdL/tbls/tab.jsp?rev=RV2014&tbl=tab20&lang=de-DE"
# de_pop <- url_de %>%
#   html() %>%
#   html_nodes(xpath='//*[@id="tab01"]') %>%
#   html_table(fill = TRUE)
# 
# de_pop <- de_pop[[1]][-c(1,3), ]
# names(de_pop) <- de_pop[1,]
# de_pop <- de_pop[-1,]
# 
# de_pop <- de_pop %>% 
#   gather(land, pop, 2:18) %>%
#   filter(land != "D") %>%
#   mutate(subs = ifelse(land %in% c("BB", "MV", "SN", "ST", "TH"), "DE-E", "DE-W"),
#          pop = sub(".", "", pop, fixed = TRUE),
#          pop = sub(",", ".", pop, fixed = TRUE),
#          pop = as.numeric(pop),
#          Jahr = as.numeric(Jahr)) %>%
#   rename(year = Jahr) %>%
#   group_by(subs, year) %>%
#   summarise(pop = sum(pop)) %>%
#   group_by(year) %>%
#   mutate(prop_pop = pop / sum(pop))
# 
# 
# ## Germany ------
# 
# de_weight <- all %>% filter(t_cntry %in% c("DE-E", "DE-W")) %>%
#   count(t_project, t_year, t_cntry) %>%
#   group_by(t_project, t_year) %>%
#   mutate(prop_surv = n / sum(n)) %>%
#   left_join(de_pop, by = c("t_cntry" = "subs", "t_year" = "year")) %>%
#   mutate(pop_weight_de = prop_pop / prop_surv) %>%
#   select(t_project, t_cntry, t_year, pop_weight_de)
# 
# rio::export(de_weight, "de_weight.csv")
# 
# ## GB ------------
# 
# # United Kingdom population data
# # https://statswales.gov.wales/v/Hww7
# 
# gb_pop <- rio::import("gb_pop.csv") %>%
#   filter(age %in% c("Aged 16 to 64", "Aged 65 and over")) %>%
#   gather(subs, pop, 3:4) %>%
#   group_by(subs, year) %>%
#   summarise(pop = sum(pop)) %>%
#   filter(year < 2012) %>%
#   group_by(year) %>%
#   mutate(prop = pop / sum(pop)) %>%
#   # the proportions were stable 1991-2012, so just taking the mean
#   group_by(subs) %>%
#   summarise(prop_pop = mean(prop))
# 
# gb_weight <- all %>% filter(t_cntry %in% c("GB-GBN", "GB-NIR")) %>%
#   count(t_project, t_year, t_cntry) %>%
#   group_by(t_project, t_year) %>%
#   mutate(prop_surv = n / sum(n)) %>%
#   left_join(gb_pop, by = c("t_cntry" = "subs")) %>%
#   mutate(pop_weight_gb = prop_pop / prop_surv) %>%
#   select(t_project, t_cntry, t_year, pop_weight_gb)
#   
# rio::export(gb_weight, "gb_weight.csv")

# apply weights

de_weight <- rio::import("1_harmonization/harmonization/de_weight.csv")
gb_weight <- rio::import("1_harmonization/harmonization/gb_weight.csv")

# a missing population weight is replaced by the neutral weight 1
neutral_if_missing <- function(w) ifelse(is.na(w), 1, w)

# attach the DE and GB population weights; both joins use the columns the
# tables have in common
all_with_de <- left_join(all, de_weight)
all_with_de_gb <- left_join(all_with_de, gb_weight)

all_weights <- all_with_de_gb %>%
  mutate(pop_weight_de = neutral_if_missing(pop_weight_de),
         pop_weight_gb = neutral_if_missing(pop_weight_gb))


# recodes for final analysis --------------

# list of countries
countries <- c("AT", "BE", "BG", "CH", "CZ", "DE", "DK", "EE", "ES", "FI", 
               "FR", "GB", "GR", "HU", "IE", "IT", "LT", "LU", "LV", "NL", 
               "NO", "PL", "PT", "RO", "SE", "SI", "SK")

# Respondent-level analysis file: restrict to the years, countries and ages
# analysed, derive sex, age group and education categories, and build the
# combined weights (each scaled to mean 1 within a survey).
all_cat_27_edu3 <- all_weights %>% 
  filter(t_year >= 1989) %>%
  # t_cntry may carry a sub-national suffix (e.g. "DE-E", "GB-NIR"); match on
  # the first two characters
  filter(substr(t_cntry, 1, 2) %in% countries) %>%
  filter(t_age >= 18 & t_age <= 74) %>%
  mutate(sex = plyr::mapvalues(t_female, c(0,1), c("M", "F")),
         # right-closed intervals (19,34], (34,54], (54,74]: ages 18 and 19
         # fall outside the breaks, get NA and are removed by
         # drop_na(age_cat) below
         # NOTE(review): confirm that excluding ages 18-19 is intended
         age_cat = cut(t_age, c(19, 34, 54, 74), right = TRUE),
         age_cat = as.character(age_cat),
         # missing years of education are filled with (age when education
         # ended - 6), first from t_ageedu, then from t_ageedu2
         t_yrsedu = ifelse(is.na(t_yrsedu), t_ageedu - 6, t_yrsedu),
         t_yrsedu = ifelse(is.na(t_yrsedu), t_ageedu2 - 6, t_yrsedu),
         # two categories from years of education: (0,11] = 1, (11,120] = 2
         educ2_yrsedu = cut(t_yrsedu, c(0, 11, 120), right = TRUE),
         educ2_yrsedu = as.numeric(educ2_yrsedu),
         # two categories from t_educ (levels 2 and 3 merged); years of
         # education are used where t_educ is missing
         educ2 = plyr::mapvalues(t_educ, c(1,2,3), c(1,2,2)),
         educ2 = ifelse(is.na(educ2), educ2_yrsedu, educ2),
         # three categories from years: (0,11] = 1, (11,15] = 2, (15,120] = 3
         educ3_yrsedu = cut(t_yrsedu, c(0, 11, 15, 120), right = TRUE),
         educ3_yrsedu = as.numeric(educ3_yrsedu),
         educ3 = ifelse(is.na(t_educ), educ3_yrsedu, t_educ),
         # 1 if the education level could only come from years of education
         educ_filled_yrsedu = ifelse(is.na(t_educ) & !is.na(t_yrsedu), 1, 0),
         # keep the sub-national code, reduce t_cntry to the two-letter code
         # and rebuild the survey identifier with it
         t_cntry_subs = t_cntry,
         t_cntry = substr(t_cntry_subs, 1, 2),
         survey = paste0(t_project, t_wave, t_cntry, t_year)) %>%
  drop_na(age_cat, t_female, educ2) %>%
  select(t_caseid, t_caseid_n, t_project, t_wave, t_year, 
         t_cntry_subs, t_cntry, survey, t_weight, t_weight_des,
         sex, t_age, age_cat, educ2, educ3, t_educ, thres, educ_filled_yrsedu,
         trust_parl, trust_parl_11, trust_jus, trust_jus_11, 
         trust_polpart, trust_polpart_11,
         pop_weight_de, pop_weight_gb) %>%
  group_by(survey) %>%
  mutate(# survey-level flag: rounded share of respondents filled from years
         educ_filled_yrsedu = round(mean(educ_filled_yrsedu, na.rm = TRUE), 0),
         # scale every weight to mean 1 within the survey
         pop_weight_de = pop_weight_de / mean(pop_weight_de),
         pop_weight_gb = pop_weight_gb / mean(pop_weight_gb),
         t_weight_des = t_weight_des / mean(t_weight_des),
         t_weight = t_weight / mean(t_weight),
         # combine survey weights with the DE and GB population weights and
         # scale the products to mean 1 again
         weight_total_des = t_weight_des * pop_weight_de * pop_weight_gb,
         weight_total_despst = t_weight * pop_weight_de * pop_weight_gb,
         weight_total_des = weight_total_des / mean(weight_total_des),
         weight_total_despst = weight_total_despst / mean(weight_total_despst),
         t_caseid_n = as.character(t_caseid_n)) %>%
  ungroup()


# final subset for analysis

# columns kept in the analysis file: identifiers, combined weights,
# demographic categories and the trust items (original and rescaled)
analysis_columns <- c("t_caseid", "t_caseid_n", "t_project", "t_wave", 
                      "t_year", "t_cntry", "survey", 
                      "weight_total_des", "weight_total_despst",
                      "sex", "age_cat", "educ3", "thres", 
                      "trust_parl", "trust_parl_11", 
                      "trust_jus", "trust_jus_11", 
                      "trust_polpart", "trust_polpart_11")

all_cat_27_edu3_subset_2_20211212 <- all_cat_27_edu3 %>%
  select(all_of(analysis_columns))

saveRDS(all_cat_27_edu3_subset_2_20211212, "1_harmonization/all_cat_27_edu3_subset_2_20211212.rds")
# serves as input for model scripts in 2_trust_models

