## Sweep 8 clean and recode function

## Load, merge and recode the sweep 8 household file and the teacher file.
##
## Arguments:
##   root  Path of the directory holding the "UKDA-8366-tab" (teacher) and
##         "UKDA-5760-tab" (household) folders.
##
## Returns:
##   A data frame with one row per Idnumber found in both files and with a
##   non-missing DhWTbrth weight. Negative values are set to NA, maternal SDQ
##   answers of 4 are set to NA, the reverse-scored SDQ items are flipped, and
##   the derived 0/1 columns mcs12, sex, tenancy, educat, family and finance
##   are appended.
sw8clean <- function(root) {

  ## load required packages
  ## (attached inside the function, as before, so existing callers that rely
  ## on dplyr being on the search path afterwards keep working)
  library(dplyr)

  ## Flip a 1-3 item (1 <-> 3, 2 unchanged); any other value, including NA,
  ## becomes NA.
  reverse_item <- function(x) {
    flipped <- 4 - x
    flipped[!(x %in% 1:3)] <- NA
    flipped
  }

  ## data
  teacher <- read.delim(
    file.path(root, "UKDA-8366-tab/tab/gus_bc1_p6_teacher_protect.tab")
  )
  sw8 <- read.delim(
    file.path(root, "UKDA-5760-tab/tab/gus_cohort1_sw8_b_protect.tab")
  )

  sw8 <- sw8 %>%
    select(
      Idnumber,
      DhHGrsp04,
      contains("MhSDQ"),
      DhSf12mn,
      DhPSU,
      DhStrat,
      DhZten02,
      DhMedu02,
      DhWTbrth,
      MhHGsx1,
      MhWmnf01
    )

  teacher <- teacher %>%
    select(
      Idnumber,
      contains("ThSDQ"),
      ThWTbrth,
      ThOutcome
    )

  ## Merging household and teacher datasets (only ids present in both are kept)
  data <- merge(teacher, sw8, by = "Idnumber")

  # setting missing data to NA (negative values are treated as missing codes)
  data[data < 0] <- NA

  # setting NA for answers of "4" (Don't know) for maternal SDQ responses
  # NOTE(review): this hits every column whose name contains "MhSDQ"; confirm
  # that no derived score columns (where 4 is a valid value) match the pattern.
  mh_cols <- grep("MhSDQ", colnames(data), value = TRUE)
  data[mh_cols] <- lapply(
    data[mh_cols],
    function(x) replace(x, x %in% 4, NA)
  )

  # Correcting reverse-scored items
  reversed_suffix <- c("07", "11", "14", "21", "25")
  reversed_items <- c(
    paste0("ThSDQ", reversed_suffix),
    paste0("MhSDQ", reversed_suffix)
  )
  data[reversed_items] <- lapply(data[reversed_items], reverse_item)

  # Predictive variables
  ## dichotomise MCS-12 at 1 sd below the mean of the household file.
  ## The reference column comes from the un-merged household data, which still
  ## carries the negative missing codes, so they are blanked out here and
  ## excluded from the mean and sd; otherwise they would distort the cutoff.
  sf12_ref <- sw8$DhSf12mn
  sf12_ref[sf12_ref < 0] <- NA
  mcs_cutoff <- mean(sf12_ref, na.rm = TRUE) - sd(sf12_ref, na.rm = TRUE)

  ## In every case_when() below, rows that match no condition (including
  ## missing inputs) come out as NA, so no explicit NA branch is needed.
  data <- data %>%
    mutate(
      ## 0 = more than 1 sd below the mean, 1 = at or above that cutoff
      mcs12 = case_when(
        DhSf12mn < mcs_cutoff ~ 0,
        DhSf12mn >= mcs_cutoff ~ 1
      ),
      ## Child biological sex
      sex = case_when(
        MhHGsx1 == 1 ~ 0, # boy
        MhHGsx1 == 2 ~ 1  # girl
      ),
      ## household tenancy
      tenancy = case_when(
        DhZten02 == 1 ~ 1,             # own home
        DhZten02 %in% c(2, 3) ~ 0,     # rent home
        DhZten02 == 4 ~ NA_real_       # other
      ),
      ## Maternal education
      educat = case_when(
        DhMedu02 == 1 ~ 1,             # higher education
        DhMedu02 %in% c(2, 3) ~ 0      # below
      ),
      # family configuration
      family = case_when(
        DhHGrsp04 == 0 ~ 0, # lone parent
        DhHGrsp04 == 1 ~ 1  # couple household
      ),
      ## Subjective household financial stability
      finance = case_when(
        MhWmnf01 <= 2 ~ 1, # Manage well
        MhWmnf01 >= 3 ~ 0  # Not managing well financially
      )
    )

  ## Setting data frame for weighted analysis:
  ## removes observations where the weight is missing
  data <- data[!is.na(data$DhWTbrth), ]

  ## return cleaned data frame
  data
}
