# Modeling public opinion over time and space: Trust in state institutions in Europe, 1989-2019
# 
# 20 February 2023
#
# Replication materials: Cleaning of demographic data for post-stratification table


# Set the working directory to the replication folder; every relative path
# used below (the "3_imputation_demographics/..." files) is resolved against it.
setwd("replication-check")

# Packages
library(eurostat) # for getting data from Eurostat
library(sjlabelled) # for dealing with variable and value labels
library(countrycode) # for switching between country code types
library(tidyverse) # for manipulating data


# List of countries
# Two-letter codes of the 27 countries kept in the analysis.
# Greece and the UK are listed as "GR" and "GB"; the Eurostat codes
# "EL" and "UK" are recoded to these further down in this script.
countries <- c(
  "AT", "BE", "BG", "CH", "CZ", "DE", "DK", "EE", "ES",
  "FI", "FR", "GB", "GR", "HU", "IE", "IT", "LT", "LU",
  "LV", "NL", "NO", "PL", "PT", "RO", "SE", "SI", "SK"
)


## 1. Getting age x sex data from Eurostat -------------

# Population counts (npop) by country, year, sex and three age bands
# ("(19,34]", "(34,54]", "(54,74]"), built from Eurostat table demo_pjan.
# The result has one row per geo x time x sex x age_cat, completed so that
# every year 1989-2019 is present for every combination.
dem <- get_eurostat("demo_pjan", stringsAsFactors = FALSE) %>%
  # recode country codes to standard for Greece and the UK
  mutate(geo = ifelse(geo == "EL", "GR", geo),
         geo = ifelse(geo == "UK", "GB", geo)) %>%
  # drop aggregate / unknown / open-ended age rows and the both-sexes total
  filter(!age %in% c("TOTAL", "UNK", "Y_LT1", "Y_OPEN"),
         sex != "T",
         geo %in% countries) %>%
  # drop the first character of the age code and keep the 4-digit year;
  # age codes that do not parse as a number become NA and are removed by
  # the range filter below
  mutate(age = as.numeric(substring(age, 2)),
         time = as.numeric(substr(time, 1, 4))) %>%
  filter(age >= 20 & age <= 74,
         time >= 1989) %>%
  mutate(age_cat = cut(age, c(19, 34, 54, 74), right = TRUE),
         age_cat = as.character(age_cat)) %>%
  # sum single years of age into the three age bands
  group_by(geo, time, sex, age_cat) %>%
  summarise(npop = sum(values)) %>%
  arrange(geo, time, sex, age_cat) %>%
  ungroup() %>%
  # make missing country-years explicit (npop = NA)
  complete(time = 1989:2019, geo, sex, age_cat) %>%
  arrange(geo, sex, age_cat, time) %>%
  # fill a missing year with the following year's count, but only within
  # the same geo x sex x age_cat series: without the grouping, lead() would
  # pull the first value of the next series into the last row of this one
  group_by(geo, sex, age_cat) %>%
  mutate(npop = ifelse(is.na(npop), lead(npop), npop)) %>%
  ungroup()
# counts of population by sex and age category
# input for MRP scripts: 4_poststratification/mrp.R

## 2. Getting education x age x sex data from Eurostat -------------

# Education counts (nobs_cat) by country, year, sex, three age bands and
# three education levels (educ3 = "1", "2", "3" for ED0-2, ED3-4, ED5-8),
# built from Eurostat table lfsa_pgaed. Cells that Eurostat does not report
# directly are derived by differencing totals and wider bands where possible.
edu <- get_eurostat("lfsa_pgaed", time_format = "num", stringsAsFactors = FALSE) %>%
  # recode country codes to standard for Greece and the UK
  mutate(geo = ifelse(geo == "EL", "GR", geo),
         geo = ifelse(geo == "UK", "GB", geo)) %>%
  # one column per education level
  spread(isced11, values) %>%
  filter(geo %in% countries,
         age != "Y15-19",
         time >= 1989) %>%
  rename(`ED3-4` = `ED3_4`) %>%
  # where the tertiary count is missing, take it as the remainder of the total
  mutate(`ED5-8` = ifelse(is.na(`ED5-8`), TOTAL - NRP - `ED0-2` - `ED3-4`, `ED5-8`)) %>%
  # back to long format; columns are picked by name rather than by position
  # so that an extra or reordered column in the download cannot shift them
  gather(isced11, values, c("ED0-2", "ED3-4", "ED5-8", "NRP", "TOTAL")) %>%
  filter(isced11 %in% c("ED0-2", "ED3-4", "ED5-8")) %>%
  # isced11 is character, so the mapped codes end up as "1", "2", "3"
  mutate(educ3 = plyr::mapvalues(isced11, c("ED0-2", "ED3-4", "ED5-8"), c(1,2,3))) %>%
  select(geo, time, age, sex, educ3, values) %>%
  group_by(geo, time, age, sex, educ3) %>%
  summarise(nobs_cat = sum(values)) %>%
  ungroup() %>%
  # one column per sex; F, M and T below are these columns (female, male,
  # total), not the logical constants
  spread(sex, nobs_cat) %>%
  mutate(M = ifelse(is.na(M), T - F, M),
         F = ifelse(is.na(F), T - M, F)) %>%
  gather(sex, nobs_cat, c("F", "M", "T")) %>%
  filter(sex != "T") %>%
  # one column per Eurostat age band, then build the three target bands
  spread(age, nobs_cat) %>%
  mutate(`Y35-39` = ifelse(is.na(`Y35-39`), `Y25-39` - `Y25-29` - `Y30-34`, `Y35-39`),
         `Y40-44` = ifelse(is.na(`Y40-44`), `Y40-59` - `Y45-49` - `Y50-59`, `Y40-44`),
         # 70-74 is always derived as the remainder of the 50-74 band
         `Y70-74` = `Y50-74`-(`Y50-54`+`Y55-64`+`Y65-69`),
         `Y20-34` = `Y20-24`+`Y25-29`+`Y30-34`,
         `Y35-54` = `Y35-39`+`Y40-44`+`Y45-49`+`Y50-54`,
         `Y55-74` = `Y55-64`+`Y65-69` + `Y70-74`) %>%
  select(geo, time, sex, educ3, `Y20-34`, `Y35-54`, `Y55-74`) %>%
  gather(age_cat, nobs_cat, c("Y20-34", "Y35-54", "Y55-74")) %>%
  # label the age bands exactly as cut() labels them in the population table
  mutate(source = "Eurostat",
         age_cat = plyr::mapvalues(age_cat, c("Y20-34", "Y35-54", "Y55-74"),
                                   c("(19,34]", "(34,54]", "(54,74]")))


## 3. Combining (1) and (2) -------------

# Attach population counts to the education counts and turn them into
# education shares within each country x year x sex x age band.
# The join keys are spelled out so that a column added to either table
# cannot silently change what is matched.
edu_dem <- left_join(edu, dem, by = c("geo", "time", "sex", "age_cat")) %>%
  # NOTE(review): the factor 1000 presumably converts counts reported in
  # thousands into persons - confirm against the Eurostat table unit
  mutate(prop_cat = nobs_cat * 1000 / npop) %>%
  select(-nobs_cat, -npop) %>%
  group_by(geo, time, sex, age_cat) %>%
  # rescale so the education shares of each cell sum to 1; a missing value
  # anywhere in the cell makes the whole cell NA
  mutate(prop_cat = prop_cat / sum(prop_cat)) %>%
  ungroup()

# store the result, then read it back so the sections below can also be run
# from the saved file
saveRDS(edu_dem, "3_imputation_demographics/edu_dem_20201020.rds")
edu_dem <- readRDS("3_imputation_demographics/edu_dem_20201020.rds")


# sample:
# education proportions over time by age group, for men in Germany, Spain, and Poland
# quick visual check, men only: one line per education level,
# age bands in rows and countries in columns
edu_dem %>%
  filter(geo %in% c("PL", "ES", "DE") & sex == "M") %>%
  ggplot(aes(x = time, y = prop_cat, colour = educ3, group = educ3)) +
  facet_grid(age_cat ~ geo) +
  geom_line(size = 1) +
  theme_minimal()


## 4. Cleaning IPUMS data -------------

# population data extracts from IPUMS International
# https://international.ipums.org/international/
# for the countries analyzed, and for variables:
# AGE, AGE2, AGE3, SEX, EDATTAIN, EDATTAIND

# Run on cluster:
# 
# library(ipumsr)
#
# ddi <- read_ipums_ddi("ipums/ipumsi_00002.xml")
# data <- read_ipums_micro(ddi, verbose = FALSE)
# 
# names(data)
# # [1] "COUNTRY"   "YEAR"      "SAMPLE"    "SERIAL"    "HHWT"      "PERNUM"
# # [7] "PERWT"     "RESIDENT"  "AGE"       "AGE2"      "AGE3"      "SEX"
# # [13] "EDATTAIN"  "EDATTAIND"
#
# 
# cat <- data %>% 
#   group_by(COUNTRY, YEAR, SAMPLE, AGE, AGE2, AGE3, SEX, EDATTAIN, EDATTAIND) %>% 
#   summarise(nw = sum(PERWT), 
#             nobs = n())
#
# saveRDS(cat, "3_imputation_demographics/IPUMS_cat.rds")

# Aggregated IPUMS extract: one row per COUNTRY x YEAR x SAMPLE x age x sex x
# education cell, with weighted (nw) and unweighted (nobs) person counts
cat <- readRDS("3_imputation_demographics/IPUMS_cat.rds")

# Education shares by country, census year, sex and age band from IPUMS,
# shaped like the Eurostat table (geo, time, sex, age_cat, educ3, prop_cat,
# source). Shares are computed from the unweighted counts (nobs).
ipums <- cat %>%
  remove_all_labels() %>%
  ungroup() %>%
  filter(AGE >= 20 & AGE <= 74,
         SEX != 9, # NOTE(review): 9 is presumably the "unknown" code - confirm
         YEAR < 2000, # after 2000 data from Eurostat are available
         COUNTRY != 756) %>% # exclude Switzerland for useless education coding
  mutate(# collapse the detailed attainment codes into three education levels
         # (1, 2, 3); codes 0 and 999 become missing
         educd = plyr::mapvalues(EDATTAIND, 
                                 c(0,100,110,120,130,211,212,221,222,311,312,321,322,400,999,NA),
                                 c(NA, 1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  3, NA,NA)),
         # Poland: codes 221 and 222 are assigned to level 2 instead of level 1
         educd_pl = ifelse(COUNTRY == 616, # education coding for Poland requires adjustment
                           plyr::mapvalues(EDATTAIND,
                                           c(0,100,110,120,130,211,212,221,222,311,312,321,322,400,999,NA),
                                           c(NA, 1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  3, NA,NA)),
                           NA),
         educ3 = ifelse(COUNTRY == 616, educd_pl, educd) %>% as.character(),
         # same age bands and labels as in the Eurostat tables; stored as
         # character there, so do the same here before the tables are stacked
         age_cat = cut(AGE, c(19, 34, 54, 74), right = TRUE),
         age_cat = as.character(age_cat),
         # switch from ISO numeric to 2-character country codes
         cntry = countrycode(COUNTRY, "iso3n", "iso2c"),
         sex = plyr::mapvalues(SEX, c(1,2), c("M", "F"))) %>%
  group_by(COUNTRY, cntry, YEAR, SAMPLE, sex, age_cat, educ3) %>%
  # rows without a usable education level do not enter the shares
  drop_na(educ3) %>%
  summarise(nobs = sum(nobs)) %>%
  filter(cntry %in% countries) %>%
  # share of each education level within country x year x sex x age band
  group_by(cntry, YEAR, sex, age_cat) %>%
  mutate(prop_cat = nobs / sum(nobs)) %>%
  select(geo = cntry, time = YEAR, sex, age_cat, educ3, prop_cat) %>%
  mutate(source = "IPUMS") %>%
  ungroup()


## 5. Combining Eurostat data (3) and IPUMS (4) -------------

# stack the Eurostat-based shares on top of the IPUMS-based shares;
# the `source` column tells the two apart
eurostat_ipums <- bind_rows(edu_dem, ipums)

saveRDS(object = eurostat_ipums,
        file = "3_imputation_demographics/eurostat_ipums_20201020.rds")
# serves as input for imputation-model.R

