
# Data manipulation: exclude cases with an invalid outcome and create factor versions of the labelled variables

# Plain copy of ESS (no rows are removed here).
ESS.no.outinval <- ESS

# Keep only rows where `outinval` is NA, convert every labelled column to a
# factor, and add a text version of the interview indicator.
ESS.factors <- ESS %>%
  filter(is.na(outinval)) %>%
  mutate_if(is.labelled, to_factor) %>%
  mutate(
    gndr_su = fct_drop(gndr_su),
    # "Interview" where `interview` converts to the level "TRUE", otherwise
    # "No interview"; the result is a character column, not a factor.
    interview_fct = if_else(as.factor(interview) == "TRUE",
                            "Interview",
                            "No interview")
  )

# Exclude rounds with large % of missing data

# Add the country-round sample size and the missing-data counts/proportions
# used for the eligibility filters to every row of `data`.
#
# `data` must contain cntry.round, interview, interva, gndr_su, age50,
# multi.unit and neighb. The statistics are computed within `cntry.round`
# and the result is returned ungrouped, with the new columns appended.
add_missing_props <- function(data) {
  data %>%
    group_by(cntry.round) %>%
    mutate(n = n(),
           interview.sum = sum(interview),
           interva789 = interva == "Blocked cases or sample unit refusal before first contact" |
             interva == "No contact made - error of field organization" |
             interva == "Not available",
           # TRUE when interva is one of the three categories above or is NA.
           # interva789 is only NA when interva is NA, so this is never NA.
           interview_na = interva789 | is.na(interva),
           int_sum = sum(interview_na),
           interva.na.prop = int_sum / n,
           gndr.na = sum(is.na(gndr_su)),
           gndr.na.prop = gndr.na / n,
           age.na = sum(is.na(age50)),
           age.na.prop = age.na / n,
           multi.unit.na = sum(is.na(multi.unit)),
           multi.unit.na.prop = multi.unit.na / n,
           neighb.na = sum(is.na(neighb)),
           neighb.na.prop = neighb.na / n,
           sum.na = gndr.na + age.na + neighb.na + multi.unit.na,
           prop.na = sum.na / n) %>%
    ungroup()
}

# Main selection: < 2% unusable interva, < 20% missing on multi.unit and neighb
ESS.na.20 <- ESS.factors %>%
  add_missing_props() %>%
  filter(interva.na.prop < 0.02) %>%
  filter(multi.unit.na.prop < 0.2) %>%
  filter(neighb.na.prop < 0.2) %>%
  filter(cntry.round != "CH-8") %>%  # Entry errors in interviewer observations data
  filter(cntry.round != "CZ-5") # Only round with large numbers in "No interview because of opt out list" which causes outliers in R indicators

# Alternative selection: < 5% unusable interva, < 5% missing on multi.unit
# NOTE(review): the neighb threshold is 0.2 here while the other two are 0.05,
# and CZ-5 is not excluded as it is for ESS.na.20 - confirm this is intended.
ESS.na.05 <- ESS.factors %>%
  add_missing_props() %>%
  filter(interva.na.prop < 0.05) %>%
  filter(multi.unit.na.prop < 0.05) %>%
  filter(neighb.na.prop < 0.2) %>%
  filter(cntry.round != "CH-8")

# Which country-rounds are excluded due to missing on interva
# (>= 0.02 is the exact complement of the `< 0.02` inclusion filter used for
# ESS.na.20, so a round at exactly 2% is listed here as well)
na.prop.interva <- ESS.factors %>%
  add_missing_props() %>%
  filter(interva.na.prop >= 0.02)


(unique(na.prop.interva$cntry.round))

# Interviewer-observation dataset: rows of ESS.na.20 that are complete on
# interview, multi.unit, neighb and typesamp2, reduced to the columns listed
# below, with unused levels dropped from the three country identifiers.
ESS.int.obs <- ESS.na.20 %>%
  filter(
    !is.na(interview),
    !is.na(multi.unit),
    !is.na(neighb),
    !is.na(typesamp2)
  ) %>%
  mutate(round_fct = as.factor(essround)) %>%
  select(
    cntry, cntry.full, essround, round, n, round_fct, cntry.round, typesamp2,
    interview, gndr_su, age50, age_cat, telnum, telephone.present, vandalism,
    physa.2, multi.unit, litter, neighb, no.access, interview_fct
  ) %>%
  mutate(
    cntry = droplevels(cntry),
    cntry.round = droplevels(as.factor(cntry.round)),
    cntry.full = droplevels(as.factor(cntry.full))
  )



# Rows per country and round before the missing-data exclusions ...
ESS.count.na <- count(group_by(ESS.factors, cntry, essround),
                      name = "na.included")
print(ESS.count.na, n = 55)

# ... and after them.
ESS.count.no.na <- count(group_by(ESS.int.obs, cntry, essround),
                         name = "na.excluded")
print(ESS.count.no.na, n = 55)

# Share of rows dropped per country and round, smallest share first.
# Country-rounds absent from ESS.int.obs get NA for na.excluded.
ESS.count.na %>%
  left_join(ESS.count.no.na) %>%
  mutate(prop.excluded = 1 - (na.excluded / na.included)) %>%
  arrange(prop.excluded) %>%
  print(n = 55)

# Plot which country-rounds are included - Figure 1


# One row per country and round with the sample size and the count and
# proportion of missing values on each variable used for eligibility.
sample.size <- ESS %>%
  select(cntry.full, cntry, round, cntry.round, interview, interview_na, gndr_su, age50,
         neighb, multi.unit) %>%
  group_by(cntry, round) %>%
  summarize(n = n(),
            interview.sum = sum(interview_na),
            interva.na.prop = interview.sum / n,
            gndr.na = sum(is.na(gndr_su)),
            gndr.na.prop = gndr.na / n,
            age.na = sum(is.na(age50)),
            age.na.prop = age.na / n,
            multi.unit.na = sum(is.na(multi.unit)),
            multi.unit.na.prop = multi.unit.na / n,
            neighb.na = sum(is.na(neighb)),
            neighb.na.prop = neighb.na / n,
            sum.na = gndr.na + age.na + neighb.na + multi.unit.na,
            prop.na = sum.na / n,
            # Passing the full-length columns here made summarize() return one
            # row per respondent instead of one per group. Taking the first
            # value keeps one row per country-round.
            # Assumes both labels are constant within a cntry/round group.
            cntry.full = first(cntry.full),
            cntry.round = first(cntry.round),
            .groups = "drop")

# Every combination of the cntry.full and round values that occur in ESS,
# including combinations with no rows in the data.
allrounds <- tidyr::expand(ESS, cntry.full, round)


# Which country rounds fail to meet eligibility criteria?


# Print the country-rounds whose missing-data proportions exceed the limits
# below, highest neighb proportion first.
# NOTE(review): multi.unit and neighb are checked against 0.05 here, whereas
# the ESS.na.20 filters use 0.2 - confirm which limit is intended.
ESS |>
  group_by(cntry.round) |>
  summarize(
    n = n(),
    multi.unit.na = sum(is.na(multi.unit)),
    multi.unit.na.prop = multi.unit.na / n,
    neighb.na = sum(is.na(neighb)),
    neighb.na.prop = neighb.na / n,
    interview.sum = sum(interview_na),
    interva.na.prop = interview.sum / n
  ) |>
  filter(neighb.na.prop > 0.05 | multi.unit.na.prop > 0.05 | interva.na.prop > 0.02) |>
  arrange(desc(neighb.na.prop)) |>
  print(n = Inf)


# For rounds after round 5 with typesamp2 "Individual person", print the
# country-rounds that stay below the age, neighb and interva limits,
# highest age proportion first.
ESS |>
  filter(essround > 5, typesamp2 == "Individual person") |>
  group_by(cntry.round) |>
  summarize(
    n = n(),
    gndr.na = sum(is.na(gndr_su)),
    gndr.na.prop = gndr.na / n,
    age.na = sum(is.na(age_cat)),
    age.na.prop = age.na / n,
    multi.unit.na = sum(is.na(multi.unit)),
    multi.unit.na.prop = multi.unit.na / n,
    neighb.na = sum(is.na(neighb)),
    neighb.na.prop = neighb.na / n,
    interview.sum = sum(interview_na),
    interva.na.prop = interview.sum / n
  ) |>
  filter(age.na.prop < 0.2 & neighb.na.prop < 0.2 & interva.na.prop < 0.02) |>
  arrange(desc(age.na.prop)) |>
  print(n = Inf)

# Which country-rounds are analysed?

# Classify every country x round combination for the inclusion figure.
# Combinations without data have n = 0 and NA proportions after the join,
# so they fall through to the last category.
analysed <- allrounds %>%
  full_join(sample.size,
            by = c("cntry.full", "round")) %>%
  replace_na(list(n = 0)) %>%
  mutate(cntry.round = str_c(cntry, round, sep = "-"),
         # `>=` is the exact complement of the `<` filters that build
         # ESS.na.20, so a round sitting exactly on a threshold is flagged
         # here just as it is dropped there. NA proportions count as FALSE.
         missing = case_when(multi.unit.na.prop >= 0.2 ~ TRUE,
                             neighb.na.prop >= 0.2 ~ TRUE,
                             interva.na.prop >= 0.02 ~ TRUE,
                             TRUE ~ FALSE),
         analysed = case_when(
           missing ~ "Missing data",
           # CH-8: data entry errors
           # CZ-5: affected by the number of people in category
           #       "No interview because of opt out list"
           cntry.round %in% c("CH-8", "CZ-5") ~ "Missing data",
           # Age missing levels too high
           cntry.round %in% c("CZ-7", "CZ-8", "CZ-9", "CZ-10",
                              "AT-7",
                              "HU-7", "HU-8", "HU-9", "HU-10") ~ "Interviewer observations",
           n > 0 & gndr.na.prop < 1 ~ "Interviewer observations and Demographics",
           n > 0 ~ "Interviewer observations",
           TRUE ~ "No data/did not participate/not published"),
         analysed = fct_relevel(analysed, "Interviewer observations and Demographics"))




# Create reduced dataset of country-rounds which include interviewer observations and demographic variables

# Keep country-rounds with < 20% missing on both gndr_su and age_cat, then
# keep only rows that are complete on both and have typesamp2
# "Individual person". `n` is recomputed here as the country-round size
# within ESS.int.obs, replacing the value carried over from ESS.na.20.
ESS.full <- ESS.int.obs |>
  group_by(cntry.round) |>
  mutate(n = n(),
         gndr.na = sum(is.na(gndr_su)),
         gndr.na.prop = gndr.na / n,
         age.na = sum(is.na(age_cat)),
         age.na.prop = age.na / n
  ) |>
  # Drop the grouping once the per-round statistics exist, so the saved
  # object is a plain tibble and later steps do not run group-wise by accident.
  ungroup() |>
  filter(gndr.na.prop < 0.2) |>
  filter(age.na.prop < 0.2) |>
  filter(!is.na(gndr_su)) |>
  filter(!is.na(age_cat)) |>
  filter(typesamp2 == "Individual person")


# Write both analysis datasets to disk. saveRDS() stores a single object in
# RDS format, so despite the ".Rdata" extension these files are read back
# with readRDS(), not load().
saveRDS(ESS.full, "Data/ESS_full.Rdata")
saveRDS(ESS.int.obs, "Data/ESS_int_obs.Rdata")





