## Code for replication

## Nolan, LB. "An Exploration of Proxy- and Self-Reported Adolescent 
##                     Health in Low-Resource Settings"
## For publication in the journal Survey Research Methods

## Data from the Young Lives Study of International Child Poverty is publicly available
  ## at the UK Data Archive: http://www.data-archive.ac.uk. See http://www.younglives.org.uk
  ## for more information.

library(foreign)
library(ppcor)
library(plyr)
library(reshape2)
library(ggplot2)
library(MASS)
library(stargazer)
library(car)
library(psych)
library(mi)
library(coefplot)
library(pscl)
library(caret)
library(nnet)
library(systemfit)
library(car)
library(polycor)


###### Setting up the data ######

## Wave 2
## Ethiopia - 2
etchildlevel5yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/ethiopia_r2/etchildlevel5yrold.dta", convert.factors = FALSE)
colnames(etchildlevel5yrold) <- tolower(colnames(etchildlevel5yrold))
etchildlevel12yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/ethiopia_r2/etchildlevel12yrold.dta", convert.factors = FALSE)
colnames(etchildlevel12yrold) <- tolower(colnames(etchildlevel12yrold))
## Whether the child is in school from the child questionnaire:
etchildquest12yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/ethiopia_r2/etchildquest12yrold.dta", convert.factors = FALSE) 
colnames(etchildquest12yrold) <- tolower(colnames(etchildquest12yrold))
etchildquest12yrold <- subset(etchildquest12yrold, select = c(childid,enrsch))
## Merging this to the main data set for older children...
etchildlevel12yrold <- merge(etchildlevel12yrold, etchildquest12yrold, by = "childid")

## India - 2
inchildlevel5yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/india_r2/inchildlevel5yrold.dta", convert.factors = FALSE)
colnames(inchildlevel5yrold) <- tolower(colnames(inchildlevel5yrold))
inchildlevel12yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/india_r2/inchildlevel12yrold.dta", convert.factors = FALSE)
colnames(inchildlevel12yrold) <- tolower(colnames(inchildlevel12yrold))
## Whether the child is in school from the child questionnaire
inchildquest12yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/india_r2/inchildquest12yrold.dta", convert.factors = FALSE)
colnames(inchildquest12yrold) <- tolower(colnames(inchildquest12yrold))
inchildquest12yrold <- subset(inchildquest12yrold, select = c(childid, enrsch))
## Merging this to the main data set for older children
inchildlevel12yrold <- merge(inchildlevel12yrold, inchildquest12yrold, by = "childid")

## Peru - 2
pechildlevel5yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/peru_r2/pechildlevel5yrold.dta", convert.factors = FALSE)
colnames(pechildlevel5yrold) <- tolower(colnames(pechildlevel5yrold))
pechildlevel12yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/peru_r2/pechildlevel12yrold.dta", convert.factors = FALSE)
colnames(pechildlevel12yrold) <- tolower(colnames(pechildlevel12yrold))
## Whether the child is in school from the child questionnaire
pechildquest12yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/peru_r2/pechildquest12yrold.dta", convert.factors = FALSE) 
colnames(pechildquest12yrold) <- tolower(colnames(pechildquest12yrold))
pechildquest12yrold <- subset(pechildquest12yrold, select = c(childid, enrsch))
## Merging this to the main data set for older children
pechildlevel12yrold <- merge(pechildlevel12yrold, pechildquest12yrold, by = "childid")

## Vietnam - 2
vnchildlevel5yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/vietnam_r2/vnchildlevel5yrold.dta", convert.factors = FALSE)
colnames(vnchildlevel5yrold) <- tolower(colnames(vnchildlevel5yrold))
vnchildlevel12yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/vietnam_r2/vnchildlevel12yrold.dta", convert.factors = FALSE)
colnames(vnchildlevel12yrold) <- tolower(colnames(vnchildlevel12yrold))
## Whether the child is in school and hfa from the child questionnaire
vnchildquest12yrold <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/vietnam_r2/vnchildquest12yrold.dta", convert.factors = FALSE)
colnames(vnchildquest12yrold) <- tolower(colnames(vnchildquest12yrold))
vnchildquest12yrold <- subset(vnchildquest12yrold, select = c(childid, enrsch))
## Merging this to the main data set for older children
vnchildlevel12yrold <- merge(vnchildlevel12yrold, vnchildquest12yrold, by = "childid")

## Combining
Ethiopia.Survey2 <- rbind.fill(etchildlevel5yrold, etchildlevel12yrold)
Ethiopia.Survey2$wave <- rep(2, length(nrow(Ethiopia.Survey2)))
Ethiopia.Survey2$country <- rep("ethiopia", nrow(Ethiopia.Survey2))

India.Survey2 <- rbind.fill(inchildlevel5yrold, inchildlevel12yrold)
India.Survey2$wave <- rep(2, length(nrow(India.Survey2)))
India.Survey2$country <- rep("india", nrow(India.Survey2))

Peru.Survey2 <- rbind.fill(pechildlevel5yrold, pechildlevel12yrold)
Peru.Survey2$wave <- rep(2, length(nrow(Peru.Survey2)))
Peru.Survey2$country <- rep("peru", nrow(Peru.Survey2))

Vietnam.Survey2 <- rbind.fill(vnchildlevel5yrold, vnchildlevel12yrold)
Vietnam.Survey2$wave <- rep(2, length(nrow(Vietnam.Survey2)))
Vietnam.Survey2$country <- rep("vietnam", nrow(Vietnam.Survey2))

wave2 <- rbind.fill(Ethiopia.Survey2, India.Survey2, Peru.Survey2, Vietnam.Survey2)

varnames2 <- names(wave2)
class(varnames2)
varnames <- sub('r2','',varnames2)
names(wave2) <- varnames

table(wave2$mvdtyp) # it worked!

## Wave 3
## Ethiopia - 3 (anthropometrics are seperate from rest of info)
et_yc_childlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/ethiopia_r3/youngerchild/et_yc_childlevel.dta", convert.factors = FALSE)
colnames(et_yc_childlevel) <- tolower(colnames(et_yc_childlevel))
et_yc_householdlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/ethiopia_r3/youngerchild/et_yc_householdlevel.dta", convert.factors = FALSE)
colnames(et_yc_householdlevel) <- tolower(colnames(et_yc_householdlevel))
et_yc_main_young <- merge(et_yc_childlevel, et_yc_householdlevel, by = "childid")
et_oc_childlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/ethiopia_r3/olderchild/et_oc_childlevel.dta", convert.factors = FALSE)
colnames(et_oc_childlevel) <- tolower(colnames(et_oc_childlevel))
et_oc_householdlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/ethiopia_r3/olderchild/et_oc_householdlevel.dta", convert.factors = FALSE)
colnames(et_oc_householdlevel) <- tolower(colnames(et_oc_householdlevel))
et_oc_main_old <- merge(et_oc_childlevel, et_oc_householdlevel, by = "childid")

## India - 3
in_yc_childlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/india_r3/youngerchild/in_yc_childlevel.dta", convert.factors = FALSE)
colnames(in_yc_childlevel) <- tolower(colnames(in_yc_childlevel))
in_yc_householdlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/india_r3/youngerchild/in_yc_householdlevel.dta", convert.factors = FALSE)
colnames(in_yc_householdlevel) <- tolower(colnames(in_yc_householdlevel))
in_yc_main <- merge(in_yc_childlevel, in_yc_householdlevel, by = "childid")
in_oc_childlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/india_r3/olderchild/in_oc_childlevel.dta", convert.factors = FALSE)
colnames(in_oc_childlevel) <- tolower(colnames(in_oc_childlevel))
in_oc_householdlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/india_r3/olderchild/in_oc_householdlevel.dta", convert.factors = FALSE)
colnames(in_oc_householdlevel) <- tolower(colnames(in_oc_householdlevel))
in_oc_main <- merge(in_oc_childlevel, in_oc_householdlevel, by = "childid")

## Peru - 3
pe_yc_childlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/peru_r3/youngerchild/pe_yc_childlevel.dta", convert.factors = FALSE)
colnames(pe_yc_childlevel) <- tolower(colnames(pe_yc_childlevel))
pe_yc_householdlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/peru_r3/youngerchild/pe_yc_householdlevel.dta", convert.factors = FALSE)
colnames(pe_yc_householdlevel) <- tolower(colnames(pe_yc_householdlevel))
pe_yc_main <- merge(pe_yc_childlevel, pe_yc_householdlevel, by = "childid")
pe_oc_childlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/peru_r3/olderchild/pe_oc_childlevel.dta", convert.factors = FALSE)
colnames(pe_oc_childlevel) <- tolower(colnames(pe_oc_childlevel))
pe_oc_householdlevel <-read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/peru_r3/olderchild/pe_oc_householdlevel.dta", convert.factors = FALSE)
colnames(pe_oc_householdlevel) <- tolower(colnames(pe_oc_householdlevel))
pe_oc_main <- merge(pe_oc_childlevel, pe_oc_householdlevel , by = "childid")

## Vietnam - 3
vn_yc_childlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/vietnam_r3/youngerchild/vn_yc_childlevel.dta", convert.factors = FALSE)
colnames(vn_yc_childlevel) <- tolower(colnames(vn_yc_childlevel))
vn_yc_householdlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/vietnam_r3/youngerchild/vn_yc_householdlevel.dta", convert.factors = FALSE)
colnames(vn_yc_householdlevel) <- tolower(colnames(vn_yc_householdlevel))
vn_yc_main <- merge(vn_yc_childlevel, vn_yc_householdlevel, by = "childid")
vn_oc_childlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/vietnam_r3/olderchild/vn_oc_childlevel.dta", convert.factors = FALSE)
colnames(vn_oc_childlevel) <- tolower(colnames(vn_oc_childlevel))
vn_oc_householdlevel <- read.dta(file = "Round_3_2009/UKDA-6853-stata11/stata11/survey_data_r3/vietnam_r3/olderchild/vn_oc_householdlevel.dta", convert.factors = FALSE)
colnames(vn_oc_householdlevel) <- tolower(colnames(vn_oc_householdlevel))
vn_oc_main <- merge(vn_oc_childlevel, vn_oc_householdlevel, by = "childid")

## Combining

## ETHIOPIA
Ethiopia.Survey3 <- rbind.fill(et_yc_main_young, et_oc_main_old)
Ethiopia.Survey3$wave <- rep(3, length(nrow(Ethiopia.Survey3)))
Ethiopia.Survey3$country <- rep("ethiopia", length(nrow(Ethiopia.Survey3)))
## dealing with class issues
eth_lengthclass <- sapply(lapply(Ethiopia.Survey3,class), length)
which(eth_lengthclass != 1)
Ethiopia.Survey3$dbrchr32 <- NULL
Ethiopia.Survey3$tmoninr3 <- NULL

## INDIA
## FIRST dealing with class issues
in_lengthclass <- sapply(lapply(in_yc_main, class), length)
which(in_lengthclass != 1)
in_yc_main$tmoninr3 <- NULL
India.Survey3 <- rbind.fill(in_yc_main, in_oc_main)
India.Survey3$wave <- rep(3, length(nrow(India.Survey3)))
India.Survey3$country <- rep("india", length(nrow(India.Survey3)))

## PERU
Peru.Survey3 <- rbind.fill(pe_yc_main, pe_oc_main)
Peru.Survey3$wave <- rep(3, length(nrow(Peru.Survey3)))
Peru.Survey3$country <- rep("peru", length(nrow(Peru.Survey3)))
pu_lengthclass <- sapply(lapply(Peru.Survey3, class), length)
which(pu_lengthclass != 1) # no class issues

## VIETNAM
Vietnam.Survey3 <- rbind.fill(vn_yc_main, vn_oc_main)
Vietnam.Survey3$wave <- rep(3, length(nrow(Vietnam.Survey3)))
Vietnam.Survey3$country <- rep("vietnam", length(nrow(Vietnam.Survey3)))
vm_lengthclass <- sapply(lapply(Vietnam.Survey3, class), length)
which(pu_lengthclass != 1) # no class issues

wave3 <- rbind.fill(Ethiopia.Survey3, India.Survey3, Peru.Survey3, Vietnam.Survey3)

varnames3 <- names(wave3)
class(varnames3)
varnames <- sub('r3','',varnames3)
names(wave3) <- varnames

## saving the data
# save(wave1, file = "newwave1.RDa")
# save(wave2, file = "newwave2.RDa")
# save(wave3, file = "newwave3.RDa")


##### Recoding variables #####

# WAVE 2: healthy = adult self-report (1-same, 2-better, 3-worse), 
# chealthy = child self-report (1-same, 2-better, 3-worse)
load("newwave2.Rda")
selfrep <- wave2[c("childid", "id1", "clustid", "zhfa", "healthy", "country",
                   "mightdie", "longterm", "sex", "careed",
                   "disab01", "disab02", "disab03", "disab04", "disab05", "disab06",
                   "disab07", "disab08", "wi",
                   "foodshrt", "agechild")]

nrow(selfrep)

## obtaining just the older cohort
selfrep <- selfrep[selfrep$agechild > 125,]
nrow(selfrep) # 3734

## Pulling in the "child questionnaire"

# Ethiopia
childe <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/ethiopia_r2/etchildquest12yrold.dta", convert.factors = FALSE)
childe <- childe[c("CHILDID", "CHEALTHY", "MENARCHE", "VOICE")]
# India
childi <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/india_r2/inchildquest12yrold.dta", convert.factors = FALSE)
childi <- childi[c("CHILDID", "CHEALTHY", "MENARCHE", "VOICE")]
# Peru
childp <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/peru_r2/pechildquest12yrold.dta", convert.factors = FALSE)
childp <- childp[c("CHILDID", "CHEALTHY", "MENARCHE", "VOICE")]
# Vietnam
childv <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/vietnam_r2/vnchildquest12yrold.dta", convert.factors = FALSE)
childv <- childv[c("CHILDID", "CHEALTHY", "MENARCHE", "VOICE")]

child <- rbind(childe, childi, childp, childv)
child <- rename(child, c(CHILDID = "childid", CHEALTHY = "chealthy"))
nrow(child) # 3658 

## merging
health <- merge(selfrep, child, by = "childid")
nrow(health) # 3645

## 89 adolescents are missing "child questionnaire" responses

## setting implausible chealthy and healthy to missing:
table(health$chealthy, useNA = "ifany")
health$chealthy <- ifelse(health$chealthy == 0 | health$chealthy == 77 |
                            health$chealthy == 88 | health$chealthy == 99,
                          NA, health$chealthy)
table(health$healthy, useNA = "ifany")
health$healthy <- ifelse(health$healthy == 77,
                         NA, health$healthy)

## How many missing on the outcome variable?
table(health$healthy, useNA = "ifany")
table(health$chealthy, useNA = "ifany")

## setting implausible parent's education to missing:
## careed is coded 0-none, 1-at least some completed primary, 
## 3-completed anything above primary
health$careed <- ifelse(is.na(health$careed) == TRUE | health$careed == 99 | health$careed == 88,
                        NA, ifelse(health$careed == 0,
                                   0,
                                   ifelse(health$careed <= 8,
                                          1, 2)))
## renaming careed as pschool to make the varname more easily understandable
health <- rename(health, c(careed = "pschool"))

## setting implausible mightdie - 1 being health issue, 0 being no health issue - to missing
table(health$mightdie, useNA = "ifany")
health$mightdie <- ifelse(health$mightdie == 77, NA, health$mightdie)

## setting implausible longterm to missing
table(health$longterm, useNA = "ifany")
health$longterm <- ifelse(health$longterm == 99, NA, health$longterm)

## creating a variable for any disability: 
## any disability sum over 8 should get 1, all kids with 8 get 0 (no disability)
health$anydis <- ifelse(is.na(health$disab01) == TRUE | is.na(health$disab02) == TRUE | is.na(health$disab03) == TRUE |
                          is.na(health$disab04) == TRUE | is.na(health$disab05) == TRUE | is.na(health$disab06) == TRUE |
                          is.na(health$disab07) == TRUE | is.na(health$disab08) == TRUE,
                        NA, health$disab01 + health$disab02 + health$disab03 + health$disab04 + health$disab05 + health$disab06 +
                          health$disab07 + health$disab08)
health$anydis <- ifelse(health$anydis >30, # >=9 indicates having one or more diability
                        NA, ifelse(health$anydis >= 9 & health$anydis < 30,
                                   1, 0))

## setting implausible height for age (i.e. < -6 and > 6) to NA
health$zhfa <- ifelse(health$zhfa > 6 | health$zhfa < -6, NA, health$zhfa)

table(health$foodshrt, useNA = "ifany")

## indicator variable for whether the child is stunted
health$stunted <- ifelse(health$zhfa < -2, 1, 0)

## creating an indicator for puberty
table(health$MENARCHE, useNA = "ifany") # menarche
health$MENARCHE <- ifelse(health$MENARCHE == 77 | health$MENARCHE == 88 | health$MENARCHE == 99, NA,
                          ifelse(health$MENARCHE > 0, 1, 0))
health <- rename(health, c(MENARCHE = "menarche"))
table(health$menarche, useNA = "ifany") # lots of missings bc this is only for females

table(health$VOICE, useNA = "ifany") # voice
health$VOICE <- ifelse(health$VOICE == -77 | health$VOICE == 77 | health$VOICE == 88 | 
                         health$VOICE == 99, NA,
                       ifelse(health$VOICE > 0, 1, 0))
health <- rename(health, c(VOICE = "voice")) # this is only for males

table(health$menarche[health$sex == 2], useNA = "ifany")
table(health$voice[health$sex == 1], useNA = "ifany")

health$puberty <- ifelse(health$sex == 2, health$menarche, health$voice)
table(health$puberty, useNA = "ifany")

nrow(health)

summary(health$wi, useNA = "ifany")

## numeric indicator for country (necessary for imputation)
table(health$country, useNA = "ifany")
health$countrynum[health$country == "ethiopia"] <- 0
health$countrynum[health$country == "india"] <- 1
health$countrynum[health$country == "peru"] <- 2
health$countrynum[health$country == "vietnam"] <- 3

## recoding sex from 1/2 to 0/1 and renaming as female
table(health$sex, useNA = "ifany")
health$sex <- health$sex - 1
health <- rename(health, c(sex = "female"))

## recoding the age of the child into years from months
health$agechild <- health$agechild / 12

nrow(health) # 3,645

## so far, the sample size by country
ddply(health, .(country), summarize, length(childid))


## Is the childid still in school?
## 0-no, 1-yes, attending regularly, 2- yes, attending irregularly

## Ethiopia
etsubhouseholdmember12 <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/ethiopia_r2/etsubhouseholdmember12.dta", convert.factors = FALSE)
colnames(etsubhouseholdmember12) <- tolower(colnames(etsubhouseholdmember12))
hh_e <- etsubhouseholdmember12[etsubhouseholdmember12$id == 0,]
hh_e <- hh_e[c("childid", "chstill")]
table(hh_e$chstill, useNA = "ifany")
## recoding to yes or no
hh_e$chstill <- ifelse(is.na(hh_e$chstill) == TRUE | hh_e$chstill == 77, NA, 
                       ifelse(hh_e$chstill == 0 | hh_e$chstill == 2, 0, 1))
prop.table(table(hh_e$chstill))

## India
insubhouseholdmember12 <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/india_r2/insubhouseholdmember12.dta", convert.factors = FALSE)
colnames(insubhouseholdmember12) <- tolower(colnames(insubhouseholdmember12))
hh_i <- insubhouseholdmember12[insubhouseholdmember12$id == 0,]
hh_i <- hh_i[c("childid", "chstill")]
table(hh_i$chstill, useNA = "ifany")
## recoding to yes or no
hh_i$chstill <- ifelse(is.na(hh_i$chstill) == TRUE | hh_i$chstill == 77, NA, 
                       ifelse(hh_i$chstill == 0 | hh_i$chstill == 2, 0, 1))
prop.table(table(hh_i$chstill))

## Peru
pesubhouseholdmember12 <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/peru_r2/pesubhouseholdmember12.dta", convert.factors = FALSE)
colnames(pesubhouseholdmember12) <- tolower(colnames(pesubhouseholdmember12))
hh_p <- pesubhouseholdmember12[pesubhouseholdmember12$id == 0,]
hh_p <- hh_p[c("childid", "chstill")]
table(hh_p$chstill, useNA = "ifany")
## recoding to yes or no
hh_p$chstill <- ifelse(is.na(hh_p$chstill) == TRUE | hh_p$chstill == 77, NA, 
                       ifelse(hh_p$chstill == 0 | hh_p$chstill == 2, 0, 1))
prop.table(table(hh_p$chstill))

## Vietnam
vnsubhouseholdmember12 <- read.dta(file = "Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/vietnam_r2/vnsubhouseholdmember12.dta", convert.factors = FALSE)
colnames(vnsubhouseholdmember12) <- tolower(colnames(vnsubhouseholdmember12))
hh_v <- vnsubhouseholdmember12[vnsubhouseholdmember12$id == 0,]
hh_v <- hh_v[c("childid", "chstill")]
table(hh_v$chstill, useNA = "ifany")
## recoding to yes or no
hh_v$chstill <- ifelse(is.na(hh_v$chstill) == TRUE | hh_v$chstill == 77, NA, 
                       ifelse(hh_v$chstill == 0 | hh_v$chstill == 2, 0, 1))
prop.table(table(hh_v$chstill))

hh <- rbind(hh_e, hh_i, hh_p, hh_v)
nrow(hh)
hh <- rename(hh, c(chstill = "aschool")) # renaming the variable for whether the adoelscent is in school

## merging with health
health <- merge(health, hh, by = "childid")
table(health$aschool, useNA = "ifany")

## deleting variables I will no longer be using
health$disab01 <- health$disab02 <- health$disab03 <- health$disab04 <- health$disab05 <-
  health$disab07 <- health$disab08 <- health$menarche <- health$voice <- NULL

## Doing the analyses ONLY on a complete data frame

## only complete data frame
health <- health[complete.cases(health),]
nrow(health) # n = 3541, so yep, 104 observations are dropped (3,645-3,541)

## sample size in each 
ddply(health, .(country), summarize, length(childid))


## Recoding and creating a variety of outcome variables

## Reported health variables were originally coded as 1-same, 2-better, 3-worse
## recoding to 0-worse, 1-same, 2-better:
health$healthyrec <- ifelse(health$healthy == 3, 0, health$healthy)
health$chealthyrec <- ifelse(health$chealthy == 3, 0, health$chealthy)

## Creating a variables for same or worse, same or better
health$worse <- ifelse(health$healthyrec == 2, NA, 
                       ifelse(health$healthyrec == 0, 1, 0)) # 0-same, 1-worse
health$cworse <- ifelse(health$chealthyrec == 2, NA, 
                        ifelse(health$chealthyrec == 0, 1, 0)) # 0-same, 1-worse
health$better <- ifelse(health$healthyrec == 0, NA,
                        ifelse(health$healthyrec == 1, 0, 1)) # 0-same, 1-better
health$cbetter <- ifelse(health$chealthyrec == 0, NA,
                         ifelse(health$chealthyrec == 1, 0, 1)) # 0-same, 1-better

## Also creating worse (1) versus same or better (0)
health$worse.alt <- ifelse(health$healthyrec == 0, 1, 0)
health$cworse.alt <- ifelse(health$chealthyrec == 0, 1, 0)

## Adding variable for whether the parent is the biological mother

caregivere <- read.dta(file = "/Users/laura/Dropbox/Young_Lives_New/Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/ethiopia_r2/etsubhouseholdmember12.dta", convert.factors = FALSE)
caregivere <- caregivere[c("CHILDID", "ID", "RELATE", "MEMSEX")]
caregivere <- rename(caregivere, c(CHILDID = "childid", ID = "id1", RELATE = "relate", MEMSEX = "memsex"))

caregiveri <- read.dta(file = "/Users/laura/Dropbox/Young_Lives_New/Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/india_r2/insubhouseholdmember12.dta", convert.factors = FALSE)
caregiveri <- caregiveri[c("CHILDID", "ID", "RELATE", "MEMSEX")]
caregiveri <- rename(caregiveri, c(CHILDID = "childid", ID = "id1", RELATE = "relate", MEMSEX = "memsex"))

caregiverp <- read.dta(file = "/Users/laura/Dropbox/Young_Lives_New/Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/peru_r2/pesubhouseholdmember12.dta", convert.factors = FALSE)
caregiverp <- caregiverp[c("CHILDID", "ID", "RELATE", "MEMSEX")]
caregiverp <- rename(caregiverp, c(CHILDID = "childid", ID = "id1", RELATE = "relate", MEMSEX = "memsex"))

caregiverv <- read.dta(file = "/Users/laura/Dropbox/Young_Lives_New/Round_2_2006/UKDA-6852-stata11_se/stata11_se/survey_data_r2/vietnam_r2/vnsubhouseholdmember12.dta", convert.factors = FALSE)
caregiverv <- caregiverv[c("CHILDID", "ID", "RELATE", "MEMSEX")]
caregiverv <- rename(caregiverv, c(CHILDID = "childid", ID = "id1", RELATE = "relate", MEMSEX = "memsex"))

caregiver <- rbind(caregivere, caregiveri, caregiverp, caregiverv)

health <- merge(health, caregiver, by = c("childid", "id1"), all.x = TRUE) # 

nrow(health)

table(health$relate, useNA = "ifany")
health$relate <- ifelse(health$relate == 1, 0, 1) # recoding relate as 0-biological parent and 1-not biological parent

table(health$memsex, useNA = "ifany") # 1-male, 2-female
# recoding memsex as 0-male, 1-female and calling it repfemale
health$repfemale <- ifelse(health$memsex == 1, 0, 1)

## variable for whether the parent is the biological mother:
health$biomum <- ifelse(health$repfemale == 1 & health$relate == 0, 1, 0)

ddply(health, .(country), summarize, mean(repfemale))


####### Figures and Tables ######

## Bar graph of adolescent SRH and parent health report
#### FIGURE 1 ####

formelt <- health[c("childid", "country", "healthyrec", "chealthyrec")]
reshaped <- melt(formelt, id.vars = c("childid", "country"), variable.name = "whorated", value.name = "rephealth")
reshaped$rephealth <- as.factor(reshaped$rephealth)
levels(reshaped$rephealth)[levels(reshaped$rephealth) == "0"] <- "Worse"
levels(reshaped$rephealth)[levels(reshaped$rephealth) == "1"] <- "Same"
levels(reshaped$rephealth)[levels(reshaped$rephealth) == "2"] <- "Better"
reshaped$country <- as.factor(reshaped$country)
levels(reshaped$country)[levels(reshaped$country) == "ethiopia"] <- "Ethiopia"
levels(reshaped$country)[levels(reshaped$country) == "india"] <- "India"
levels(reshaped$country)[levels(reshaped$country) == "peru"] <- "Peru"
levels(reshaped$country)[levels(reshaped$country) == "vietnam"] <- "Vietnam"

## to get the proportions
littlen <- ddply(reshaped, .(country, whorated), summarize, N = length(childid))
# adding summary statistics for all countries 
littlen_all <- ddply(reshaped, .(whorated), summarize,  N = length(childid))
littlen_all$country <- "All"
littlen <- rbind(littlen, littlen_all)
bign <- ddply(reshaped, .(country, rephealth, whorated), summarize, n = length(rephealth))
# adding summary statistics for all countries
bign_all <- ddply(reshaped, .(rephealth, whorated), summarize, n = length(rephealth)) 
bign_all$country <- "All" # adding summary statistics for all countries
bign <- rbind(bign, bign_all)
proportion <- merge(littlen, bign, by = c("country", "whorated"))
proportion$prop <- proportion$n / proportion$N

# svg("/Users/laura/Desktop/test.svg",width=14,height=7, pointsize=12)

# Cairo(file="/Users/laura/Desktop/test.svg", 
      # type="svg",
      # units="in", 
      # width=5, 
      # height=4, 
      # pointsize=12, 
      # dpi=96)
ggplot(data = proportion, 
       aes(x = rephealth, y = prop, fill = whorated)) + 
  geom_bar(stat = "identity", aes(fill = whorated), position = position_dodge()) + 
  facet_grid(. ~ country) +
  theme_bw() +
  scale_fill_brewer(palette = "Paired", labels = c("Parent", "Adolescent"), name = "Who rated \n health") +
  labs(x = "Reported health") +
  ylab("proportion") +
  theme(axis.text.x = element_text(angle = 50, vjust = .5, size = 24),
        axis.title.x = element_text(face = "bold", size = 26),
        axis.text.y = element_text(size = 24),
        axis.title.y = element_text(face = "bold", size = 26),
        plot.title = element_text(size = 30), 
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 18),
        strip.text.x = element_text(face = "bold", size = 24))

# dev.off()


## Health report concordance matricies:

#### TABLE 1 ####
table(health$healthyrec, useNA = "ifany")
table(health$chealthyrec, useNA = "ifany")

table(health$chealthyrec[health$healthyrec == 0])
table(health$chealthyrec[health$healthyrec == 1])
table(health$chealthyrec[health$healthyrec == 2])

prop.table(table(health$chealthyrec[health$healthyrec == 0]))
prop.table(table(health$chealthyrec[health$healthyrec == 1]))
prop.table(table(health$chealthyrec[health$healthyrec == 2]))

#### APPENDIX TABLE A1 ####
# for boys
table(health$chealthyrec[health$healthyrec == 0 & health$female == 0])
table(health$chealthyrec[health$healthyrec == 1 & health$female == 0])
table(health$chealthyrec[health$healthyrec == 2 & health$female == 0])

prop.table(table(health$chealthyrec[health$healthyrec == 0 & health$female == 0]))
prop.table(table(health$chealthyrec[health$healthyrec == 1 & health$female == 0]))
prop.table(table(health$chealthyrec[health$healthyrec == 2 & health$female == 0]))

#### APPENDIX TABLE A2 ####
# for girls
table(health$chealthyrec[health$healthyrec == 0 & health$female == 1])
table(health$chealthyrec[health$healthyrec == 1 & health$female == 1])
table(health$chealthyrec[health$healthyrec == 2 & health$female == 1])

prop.table(table(health$chealthyrec[health$healthyrec == 0 & health$female == 1]))
prop.table(table(health$chealthyrec[health$healthyrec == 1 & health$female == 1]))
prop.table(table(health$chealthyrec[health$healthyrec == 2 & health$female == 1]))

#### Proportion overlap between parent and adolescent health ratings
table(health$healthyrec, health$chealthyrec)
prop.table(table(health$healthyrec, health$chealthyrec))


## Polychoric correlation between the health reports
polychor(health$healthyrec, health$chealthyrec) # 0.5588
# By gender
polychor(health$healthyrec[health$female == 1], health$chealthyrec[health$female == 1]) # 0.5711
polychor(health$healthyrec[health$female == 0], health$chealthyrec[health$female == 0]) # 0.5447
# By whether the parent is biological
polychor(health$healthyrec[health$biomum == 1], health$chealthyrec[health$biomum == 1]) # 0.5583
polychor(health$healthyrec[health$biomum == 0], health$chealthyrec[health$biomum == 0]) # 0.5644


#### TABLE 2 ####

prop.table(table(health$mightdie))
prop.table(table(health$longterm))
prop.table(table(health$anydis))
prop.table(table(health$stunted))
prop.table(table(health$foodshrt))
prop.table(table(health$puberty))
prop.table(table(health$female))
prop.table(table(health$pschool))
sd(health$pschool)
prop.table(table(health$aschool))
sd(health$aschool, na.rm = T)
summary(health$wi)


## REGRESSIONS OF HEALTH REPORTS ON ALL PHYS HEALTH VARS + wi

## Models for FIGURES 2 and 3 and APPENDIX TABLE A3
lm1.1 <- glm(worse ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country), data = health, family = "binomial")
pR2(lm1.1) 
lm2.1 <- glm(cworse ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country), data = health, family = "binomial")
pR2(lm2.1) 
lm3.1 <- glm(better ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country), data = health, family = "binomial")
pR2(lm3.1) 
lm4.1 <- glm(cbetter ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country), data = health, family = "binomial")
pR2(lm4.1) 

## Forest plot of coefficients for WORSE (1) as compared to the same (1) health
#### FIGURE 2 ####
lm1.1coef <- data.frame(exp(cbind(coef(lm1.1), confint(lm1.1)))) 
lm1.1coef <- lm1.1coef[2:8,] # keeping just the relevant covariates
names(lm1.1coef) <- c("OR", "lower", "upper")
lm1.1coef$vars <- row.names(lm1.1coef)
lm1.1coef$Respondent <- rep("Parent", nrow(lm1.1coef))

lm2.1coef <- data.frame(exp(cbind(coef(lm2.1), confint(lm2.1)))) 
lm2.1coef <- lm2.1coef[2:8,] # keeping just the relevant covariates
names(lm2.1coef) <- c("OR", "lower", "upper")
lm2.1coef$vars <- row.names(lm2.1coef)
lm2.1coef$Respondent <- rep("Adolescent", nrow(lm2.1coef))

worseOR <- rbind(lm1.1coef, lm2.1coef)

ggplot(worseOR, aes(y = OR, x = reorder(vars, OR), ymin = lower, ymax = upper, linetype = Respondent)) +
  geom_point(position = position_dodge(width=0.5), size = 3) +
  geom_errorbar(width = .5, position=position_dodge(width=0.5), size = 1) +
  scale_y_continuous(breaks = c(.5, 1.0, 2.0, 4.0, 8.0), limits = c(.5, 8), trans = "log10") +
  geom_hline(yintercept = 1, linetype=2) +
  coord_flip() +
  labs(x = "Variables", y = "OR") +
  scale_x_discrete(labels=c("Puberty", "Female", "Food shortage",
                            "Stunted", "Poor physical \nfunctioning", 
                            "Might die", "Long term \nhealth problem")) +
  theme_bw() +
  #ggtitle("Logistic regression of worse as compared \nto same health report (OR)") +
  theme(axis.text.x = element_text(size = 24),
        axis.title.x = element_text(face = "bold", size = 28),
        axis.text.y = element_text(size = 24),
        axis.title.y = element_text(face = "bold", size = 28),
        legend.title = element_text(size = 24),
        legend.text = element_text(size = 26),
        legend.position = c(0.7, .2),
        title = element_text(face = "bold", size = 30)) +
  guides(linetype = guide_legend(reverse = TRUE))



## Forest plot of coefficients for BETTER (1) as compared to the same (0) health
#### FIGURE 3 ####
lm3.1coef <- data.frame(exp(cbind(coef(lm3.1), confint(lm3.1)))) 
lm3.1coef <- lm3.1coef[2:8,] # keeping just the relevant covariates
names(lm3.1coef) <- c("OR", "lower", "upper")
lm3.1coef$vars <- row.names(lm3.1coef)
lm3.1coef$Respondent <- rep("Parent", nrow(lm3.1coef))

lm4.1coef <- data.frame(exp(cbind(coef(lm4.1), confint(lm4.1)))) 
lm4.1coef <- lm4.1coef[2:8,] # keeping just the relevant covariates
names(lm4.1coef) <- c("OR", "lower", "upper")
lm4.1coef$vars <- row.names(lm4.1coef)
lm4.1coef$Respondent <- rep("Adolescent", nrow(lm4.1coef))

betterOR <- rbind(lm3.1coef, lm4.1coef)

ggplot(betterOR, aes(y = OR, x = reorder(vars, OR), ymin = lower, ymax = upper, linetype = Respondent)) +
  geom_point(position = position_dodge(width=0.5), size = 3) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = .5, 
                position=position_dodge(width=0.5), size = 1,
                stat = "identity") +
  scale_y_continuous(breaks = c(.25, .5, 1.0, 2.0, 4.0), limits = c(.25, 4), trans = "log10") +
  geom_hline(yintercept = 1, linetype=2) +
  coord_flip() +
  labs(x = "Variables", y = "OR") +
  scale_x_discrete(labels=c("Poor physical \nfunctioning",
                            "Long term \nhealth problem", "Stunted", 
                            "Food shortage", "Might die", "Puberty", "Female")) +
  theme_bw() +
  #ggtitle("Logistic regression of better as compared \nto same health report (OR)") +
  theme(axis.text.x = element_text(size = 24),
        axis.title.x = element_text(face = "bold", size = 28),
        axis.text.y = element_text(size = 24),
        axis.title.y = element_text(face = "bold", size = 28),
        legend.title = element_text(size = 24),
        legend.text = element_text(size = 26),
        legend.position = c(.3, .7),
        title = element_text(face = "bold", size = 30)) +
  guides(linetype = guide_legend(reverse = TRUE))




## 2. rerunning the original physical health model with the new outcome - separately for parents and children
## worse.alt = worse (1) versus same or better (0)

# Models for FIGURE 4 and APPENDIX Table A4

lm2.1 <- glm(worse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country), data = health, family = "binomial") # parent
pR2(lm2.1) 
lm2.2 <- glm(cworse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country), data = health, family = "binomial") # adolescent
pR2(lm2.2)


## COMPARING THE COEFFICIENTS FROM THE TWO MODELS

test1 <- worse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country) 
test2 <- cworse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
              female + wi + factor(pschool) + aschool + factor(country) 

fit <- systemfit(list(parent = test1, adolescent = test2), data = health)
summary(fit)

restriction1 <- "parent_mightdie - adolescent_mightdie" 
linearHypothesis(fit, restriction1, test = "Chisq")

restriction2 <- "parent_longterm - adolescent_longterm" 
linearHypothesis(fit, restriction2, test = "Chisq")

restriction3 <- "parent_anydis - adolescent_anydis" 
linearHypothesis(fit, restriction3, test = "Chisq")

restriction4 <- "parent_stunted - adolescent_stunted" 
linearHypothesis(fit, restriction4, test = "Chisq")

restriction5 <- "parent_foodshrt - adolescent_foodshrt"
linearHypothesis(fit, restriction5, test = "Chisq")

restriction6 <- "parent_puberty - adolescent_puberty"
linearHypothesis(fit, restriction6, test = "Chisq")

restriction7 <- "parent_female - adolescent_female"
linearHypothesis(fit, restriction7, test = "Chisq")

restriction8 <- "parent_wi - adolescent_wi" 
linearHypothesis(fit, restriction8, test = "Chisq")

# testing whether males have experienced some of the physical health problems

t.test(health$mightdie[health$female == 1], health$mightdie[health$female == 0])
t.test(health$longterm[health$female == 1], health$longterm[health$female == 0])
t.test(health$anydis[health$female == 1], health$anydis[health$female == 0])
t.test(health$stunted[health$female == 1], health$stunted[health$female == 0])
t.test(health$foodshrt[health$female == 1], health$foodshrt[health$female == 0])
t.test(health$puberty[health$female == 1], health$puberty[health$female == 0])


## Forest plot of coefficients for worse (1) as compared to the same or better (1) health 

#### FIGURE 4 ####
lm2.1coef <- data.frame(exp(cbind(coef(lm2.1), confint(lm2.1)))) 
lm2.1coef <- lm2.1coef[2:8,] # keeping just the relevant covariates
names(lm2.1coef) <- c("OR", "lower", "upper")
lm2.1coef$vars <- row.names(lm2.1coef)
lm2.1coef$Respondent <- rep("Parent", nrow(lm2.1coef))

lm2.2coef <- data.frame(exp(cbind(coef(lm2.2), confint(lm2.2)))) 
lm2.2coef <- lm2.2coef[2:8,] # keeping just the relevant covariates
names(lm2.2coef) <- c("OR", "lower", "upper")
lm2.2coef$vars <- row.names(lm2.2coef)
lm2.2coef$Respondent <- rep("Adolescent", nrow(lm2.2coef))

alt.worseOR <- rbind(lm2.1coef, lm2.2coef)

ggplot(alt.worseOR, aes(y = OR, x = reorder(vars, OR), ymin = lower, ymax = upper, linetype = Respondent)) +
  geom_point(position = position_dodge(width=0.5), size = 3) +
  geom_errorbar(width = .5, position=position_dodge(width=0.5), size = 1) +
  scale_y_continuous(breaks = c(.1, .5, 1.0, 2.0, 4.0, 8.0), limits = c(.5, 8), trans = "log10") +
  geom_hline(yintercept = 1, linetype=2) +
  coord_flip() +
  labs(x = "Variables", y = "OR") +
  scale_x_discrete(labels = c("Puberty", "Female", "Food shortage", "Stunted",
                              "Poor physical \nfunctioning", "Might die", 
                              "Long term \n health problem")) +
  theme_bw() +
  #ggtitle("Logistic regression of worse as compared to \nsame or better health report (OR)") +
  theme(axis.text.x = element_text(size = 24),
        axis.title.x = element_text(face = "bold", size = 28),
        axis.text.y = element_text(size = 24),
        axis.title.y = element_text(face = "bold", size = 28),
        legend.title = element_text(size = 24),
        legend.text = element_text(size = 26),
        legend.position = c(0.7, .2),
        title = element_text(face = "bold", size = 30)) +
  guides(linetype = guide_legend(reverse = TRUE))



## Re-running the final/preferred analyses (worse v. same or better) JUST for biological mothers:
## NOTE: this was requested by a reviewer and mentioned in the paper. It is not in tables/figures
lm2.1_bio <- glm(worse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country), 
             data = health[health$biomum == 1,], family = "binomial") # parent
pR2(lm2.1_bio)
lm2.2_bio <- glm(cworse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               female + wi + factor(pschool) + aschool + factor(country), 
             data = health[health$biomum == 1,], family = "binomial") # adolescent
pR2(lm2.2_bio)

lm2.1coef_bio <- data.frame(exp(cbind(coef(lm2.1_bio), confint(lm2.1_bio)))) 
lm2.1coef_bio <- lm2.1coef_bio[2:8,] # keeping just the relevant covariates
names(lm2.1coef_bio) <- c("OR", "lower", "upper")
lm2.1coef_bio$vars <- row.names(lm2.1coef_bio)
lm2.1coef_bio$Respondent <- rep("Parent", nrow(lm2.1coef_bio))

lm2.2coef_bio <- data.frame(exp(cbind(coef(lm2.2_bio), confint(lm2.2_bio)))) 
lm2.2coef_bio <- lm2.2coef_bio[2:8,] # keeping just the relevant covariates
names(lm2.2coef_bio) <- c("OR", "lower", "upper")
lm2.2coef_bio$vars <- row.names(lm2.2coef_bio)
lm2.2coef_bio$Respondent <- rep("Adolescent", nrow(lm2.2coef_bio))

alt.worseOR_bio <- rbind(lm2.1coef_bio, lm2.2coef_bio)

ggplot(alt.worseOR_bio, aes(y = OR, x = reorder(vars, OR), ymin = lower, ymax = upper, linetype = Respondent)) +
  geom_point(position = position_dodge(width=0.5), size = 3) +
  geom_errorbar(width = .5, position=position_dodge(width=0.5), size = 1) +
  #scale_y_log10(breaks = c(0.6, 0.8, 1.0, 2.0, 4.0, 6.0)) + 
  scale_y_continuous(breaks = c(.1, .5, 1.0, 2.0, 4.0, 8.0), limits = c(.5, 8), trans = "log10") +
  geom_hline(yintercept = 1, linetype=2) +
  coord_flip() +
  labs(x = "Variables", y = "OR") +
  scale_x_discrete(labels = c("Female", "Puberty", "Food shortage", "Stunted",
                              "Poor physical \nfunctioning", "Might die", 
                              "Long term \n health problem")) +
  theme_bw() +
  #ggtitle("Logistic regression of worse as compared to \nsame or better health report (OR) \n Just bio parents") +
  theme(axis.text.x = element_text(size = 24),
        axis.title.x = element_text(face = "bold", size = 28),
        axis.text.y = element_text(size = 24),
        axis.title.y = element_text(face = "bold", size = 28),
        legend.title = element_text(size = 24),
        legend.text = element_text(size = 26),
        legend.position = c(0.7, .2),
        title = element_text(face = "bold", size = 30)) +
  guides(linetype = guide_legend(reverse = TRUE))


## Running the models separately by genger (APPENDIX FIGURES A1 and A2)

## First for males
#### FIGURE A1 ####

## PARENT regression
test <- (glm(worse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               factor(country) + wi + factor(pschool) + aschool, 
             data = health[health$female == 0,], family = "binomial"))
# exponentiating for odds ratios and their confidence intervals
test.coef <- data.frame(exp(cbind(coef(test), confint(test))))
test.coef <- test.coef[2:7,] # only health conditions
names(test.coef) <- c("OR", "lower", "upper")
test.coef$vars <- row.names(test.coef)
test.coef$Respondent <- rep("Parent", nrow(test.coef))

# ADOLESCENT regression
test2 <- (glm(cworse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
                factor(country) + wi + factor(pschool) + aschool, 
              data = health[health$female == 0,], family = "binomial"))
test2.coef <- data.frame(exp(cbind(coef(test2), confint(test2))))
test2.coef <- test2.coef[2:7,]
names(test2.coef) <- c("OR", "lower", "upper")
test2.coef$vars <- row.names(test2.coef)
test2.coef$Respondent <- rep("Adolescent", nrow(test2.coef))
# Putting the two things together
worseOR <- data.frame(rbind(test.coef, test2.coef))

# Forest plot - regression of better as compared to same health report JUST BIOLOGICAL PARENTS

ggplot(worseOR, aes(y = OR, x = reorder(vars, OR), ymin = lower, ymax = upper, linetype = Respondent)) +
  geom_point(position = position_dodge(width=0.5), size = 3) +
  geom_errorbar(width = .5, position=position_dodge(width=0.5), size = 1) +
  scale_y_continuous(breaks = c(.25, .5, 1.0, 2.0, 4.0, 8.0, 16.0), limits = c(.25, 16), trans = "log10") +
  geom_hline(yintercept = 1, linetype=2) +
  coord_flip() +
  labs(x = "Variables", y = "OR") +
  scale_x_discrete(labels = c("Puberty", "Food shortage", "Stunted", "Might die",  
                              "Poor physical \n functioning", "Long term \n health problem")) +
  theme_bw() +
  #ggtitle("Logistic regression of worse as compared to \nsame or better health report (OR), males") +
  theme(axis.text.x = element_text(size = 24),
        axis.title.x = element_text(face = "bold", size = 28),
        axis.text.y = element_text(size = 24),
        axis.title.y = element_text(face = "bold", size = 28),
        legend.title = element_text(size = 24),
        legend.text = element_text(size = 26),
        legend.position = c(.8, .2),
        title = element_text(face = "bold", size = 30)) +
  guides(linetype = guide_legend(reverse = TRUE))


### Then for Females
#### FIGURE A2 ####

## PARENT regression
test <- (glm(worse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
               factor(country) + wi + factor(pschool) + aschool, 
             data = health[health$female == 1,], family = "binomial"))
# exponentiating for odds ratios and their confidence intervals
test.coef <- data.frame(exp(cbind(coef(test), confint(test))))
test.coef <- test.coef[2:7,] # only health conditions
names(test.coef) <- c("OR", "lower", "upper")
test.coef$vars <- row.names(test.coef)
test.coef$Respondent <- rep("Parent", nrow(test.coef))

# ADOLESCENT regression
test2 <- (glm(cworse.alt ~ mightdie + longterm + anydis + stunted + foodshrt + puberty +
                factor(country) + wi + factor(pschool) + aschool, 
              data = health[health$female == 1,], family = "binomial"))
test2.coef <- data.frame(exp(cbind(coef(test2), confint(test2))))
test2.coef <- test2.coef[2:7,]
names(test2.coef) <- c("OR", "lower", "upper")
test2.coef$vars <- row.names(test2.coef)
test2.coef$Respondent <- rep("Adolescent", nrow(test2.coef))
# Putting the two things together
worseOR <- data.frame(rbind(test.coef, test2.coef))

# Forest plot - regression of better as compared to same health report

ggplot(worseOR, aes(y = OR, x = reorder(vars, OR), ymin = lower, ymax = upper, linetype = Respondent)) +
  geom_point(position = position_dodge(width=0.5), size = 3) +
  geom_errorbar(width = .5, position=position_dodge(width=0.5), size = 1) +
  scale_y_continuous(breaks = c(.25, .5, 1.0, 2.0, 4.0, 8.0), limits = c(.25, 8), trans = "log10") +
  geom_hline(yintercept = 1, linetype=2) +
  coord_flip() +
  labs(x = "Variables", y = "OR") +
  scale_x_discrete(labels = c("Puberty", "Food shortage", "Stunted", "Poor physical \n functioning", 
                              "Might die", "Long term \n health problem")) +
  theme_bw() +
  #ggtitle("Logistic regression of worse as compared to \nsame or better health report (OR), females") +
  theme(axis.text.x = element_text(size = 24),
        axis.title.x = element_text(face = "bold", size = 28),
        axis.text.y = element_text(size = 24),
        axis.title.y = element_text(face = "bold", size = 28),
        legend.title = element_text(size = 24),
        legend.text = element_text(size = 26),
        legend.position = c(.8, .2),
        title = element_text(face = "bold", size = 30)) +
  guides(linetype = guide_legend(reverse = TRUE))


## SENSITIVITY AND SPECIFICITY

# sample
lvs <- c("normal", "abnormal")
truth <- factor(rep(lvs, times = c(86, 258)),
                levels = rev(lvs))
pred <- factor(c(
    rep(lvs, times = c(54, 32)),
    rep(lvs, times = c(27, 231))),               
  levels = rev(lvs))

xtab <- table(pred, truth)

sensitivity(pred, truth)

## worse (1) versus same or better (0)
## when using the table function, the first listed var is the row,
## and the second-listed var is the column

table(health$stunted[health$cworse.alt == 1])
table(health$stunted[health$cworse.alt == 0])

table(health$stunted[health$worse.alt == 1])
table(health$stunted[health$worse.alt == 0])

adolescent <- table(health$cworse.alt, health$stunted)
confusionMatrix(adolescent)

parent <- table(health$worse.alt, health$stunted)
confusionMatrix(parent)

# worse as compared to same
adolescent2 <- table(health$cworse[is.na(health$cworse) == FALSE], 
                     health$stunted[is.na(health$cworse) == FALSE])
confusionMatrix(adolescent2)

parent2 <- table(health$worse[is.na(health$worse) == FALSE], 
                     health$stunted[is.na(health$worse) == FALSE])
confusionMatrix(parent2)

# better as compared to same
adolescent3 <- table(health$cbetter[is.na(health$cbetter) == FALSE],
                     health$stunted[is.na(health$cbetter) == FALSE])
confusionMatrix(adolescent3)

parent3 <- table(health$better[is.na(health$better) == FALSE],
                 health$stunted[is.na(health$better) == FALSE])
confusionMatrix(parent3)


## By country:

sens_cntry_adolesc <- function(x) {
  adolescent <- table(x$cworse.alt, x$stunted)
  confusionMatrix(adolescent)
}

dlply(health, .(country), sens_cntry_adolesc)


sens_cntry_parent <- function(x) {
  parent <- table(x$worse.alt, x$stunted)
  confusionMatrix(parent)
}

dlply(health, .(country), sens_cntry_parent)



## MULTINOMIAL LOGISTIC REGRESSION

#### TABLE 3 ####

# The multinom package does not include p-value calculation for 
# the regression coefficients, so we calculate p-values using 
# Wald tests (here z-tests).

# reseting the reference so it is same - to compare worse to same
# and better to same 

health$healthyrec <- factor(health$healthyrec)
health$healthyrec <- relevel(health$healthyrec, "1")

health$chealthyrec <- factor(health$chealthyrec)
health$chealthyrec <- relevel(health$chealthyrec, "1")


# First the adolescent's health report:
adolescent_multinomial <- multinom(chealthyrec ~ mightdie + longterm + anydis + 
                          stunted + foodshrt + puberty + female + 
                          factor(country) + wi + factor(pschool) + 
                          aschool, data = health)
summary(adolescent_multinomial)
adolescent_z <- summary(adolescent_multinomial)$coefficients/summary(adolescent_multinomial)$standard.errors
adolescent_p <- (1 - pnorm(abs(adolescent_z), 0, 1)) * 2
adolescent_p

exp(coef(adolescent_multinomial))
exp(confint(adolescent_multinomial)) # this has two items...

# Then the parent's health report:
parent_multinomial <- multinom(healthyrec ~ mightdie + longterm + anydis + 
                          stunted + foodshrt + puberty + female + 
                          factor(country) + wi + factor(pschool) + 
                          aschool, data = health)
summary(parent_multinomial)

parent_z <- summary(parent_multinomial)$coefficients/summary(parent_multinomial)$standard.errors
parent_p <- (1 - pnorm(abs(parent_z), 0, 1)) * 2
parent_p

exp(coef(parent_multinomial))
exp(confint(parent_multinomial)) # this has two items...




## PRE-IMPUTATION DIAGNOSTICS

for_imp <- health[c("childid", "healthyrec", "chealthyrec", "mightdie", "longterm",
                    "anydis", "stunted", "foodshrt", "female",
                    "wi", "puberty", "countrynum", "zhfa", "pschool", "aschool")]

# note that the health variables are factor variables and need to be numeric
class(for_imp$healthyrec)
table(for_imp$healthyrec, useNA = "ifany")
# creating a numeric variable
for_imp$healthyrec_num <- ifelse(for_imp$healthyrec == 0, 0, 
                                 ifelse(for_imp$healthyrec == 1, 1, 2))
class(for_imp$healthyrec_num)
table(for_imp$healthyrec_num)
  
class(for_imp$chealthyrec)
table(for_imp$healthyrec, useNA = "ifany")
for_imp$chealthyrec_num <- ifelse(for_imp$chealthyrec == 0, 0, 
                                 ifelse(for_imp$chealthyrec == 1, 1, 2))
class(for_imp$chealthyrec_num)
table(for_imp$chealthyrec_num)

# replacing the factors with the numeric
for_imp$healthyrec <- for_imp$chealthyrec <- NULL

for_imp <- rename(for_imp, c("chealthyrec_num" = "chealthyrec", "healthyrec_num" = "healthyrec"))

head(for_imp)


## only complete cases:
for_imp <- for_imp[complete.cases(for_imp),] # 205 obs missing on at least one covariate
nrow(for_imp)

## what is the correlation between parent and child health reports?
cor(for_imp$healthyrec, for_imp$chealthyrec) # only 0.45...

## info on the data frame for imputation
mi.info(for_imp) # should be the complete data frame

for_imp$randomsid <- row.names(for_imp)

## generating random lists of row numbers (i.e. unique ids for the adolescents)
## generating random numbers with which to set the seeds:

## Doing the procedure from 25 missing to 425 missing in increments of 50
## the sample size is 3541 so this ranges from less than 1 percent to 12 percent missing
## This would require I graph 25, 75, 125, 175, 225, 275, 325, 375, 425

set.seed(440)
randoms25 <- as.integer(sample(for_imp$randomsid, 25, replace = FALSE))
set.seed(440)
randoms75 <- as.integer(sample(for_imp$randomsid, 75, replace = FALSE))
set.seed(440)
randoms125 <- as.integer(sample(for_imp$randomsid, 125, replace = FALSE))
set.seed(440)
randoms175 <- as.integer(sample(for_imp$randomsid, 175, replace = FALSE))
set.seed(440)
randoms225 <- as.integer(sample(for_imp$randomsid, 225, replace = FALSE))
set.seed(440)
randoms275 <- as.integer(sample(for_imp$randomsid, 275, replace = FALSE))
set.seed(440)
randoms325 <- as.integer(sample(for_imp$randomsid, 325, replace = FALSE))
set.seed(440)
randoms375 <- as.integer(sample(for_imp$randomsid, 375, replace = FALSE))
set.seed(440)
randoms425 <- as.integer(sample(for_imp$randomsid, 425, replace = FALSE))

## Possible imputations of wave 2 information:
# - Impute parent report with and without SRH
# - Impute adolescent report with and without parent's report

cor(for_imp$healthyrec, for_imp$chealthyrec, method = "spearman") # 0.442

## Number missing options
imp25 <- for_imp
imp25$ifin <- as.factor(imp25$randomsid %in% randoms25)
table(imp25$ifin)

imp75 <- for_imp
imp75$ifin <- as.factor(imp75$randomsid %in% randoms75)
table(imp75$ifin)

imp125 <- for_imp
imp125$ifin <- as.factor(imp125$randomsid %in% randoms125)
table(imp125$ifin)

imp175 <- for_imp
imp175$ifin <- as.factor(imp175$randomsid %in% randoms175)
table(imp175$ifin)

imp225 <- for_imp
imp225$ifin <- as.factor(imp225$randomsid %in% randoms225)
table(imp225$ifin)

imp275 <- for_imp
imp275$ifin <- as.factor(imp275$randomsid %in% randoms275)
table(imp275$ifin)

imp325 <- for_imp
imp325$ifin <- as.factor(imp325$randomsid %in% randoms325)
table(imp325$ifin)

imp375 <- for_imp
imp375$ifin <- as.factor(imp375$randomsid %in% randoms375)
table(imp375$ifin)

imp425 <- for_imp
imp425$ifin <- as.factor(imp425$randomsid %in% randoms425)
table(imp425$ifin)

## FIRST WTIH ADOLESCENT SRH ##

## 25 missings
p.imp25wSRH <- imp25[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp25 <- p.imp25wSRH
p.imp25$healthyrec <- ifelse(p.imp25$ifin == "TRUE", NA, p.imp25$healthyrec)
p.imp25$ifin <- NULL
p.imputed25 <- mi(p.imp25, max.minutes = 30, seed = 4)
p.imputed25.list <- mi.completed(p.imputed25) # extracting the data frames
df <- ldply(p.imputed25.list, data.frame) # rbinding
p.means25 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr25 <- merge(p.imp25wSRH, p.means25, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH25 <- cor(forcorr25$healthyrec[forcorr25$ifin == TRUE], 
                    forcorr25$imphealthyrec[forcorr25$ifin == TRUE], method = "spearman")
p.imp.wSRH25

## 75 missings
p.imp75wSRH <- imp75[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp75 <- p.imp75wSRH
p.imp75$healthyrec <- ifelse(p.imp75$ifin == "TRUE", NA, p.imp75$healthyrec)
p.imp75$ifin <- NULL
p.imputed75 <- mi(p.imp75, max.minutes = 30, seed = 4)
p.imputed75.list <- mi.completed(p.imputed75) # extracting the data frames
df <- ldply(p.imputed75.list, data.frame) # rbinding
p.means75 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr75 <- merge(p.imp75wSRH, p.means75, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH75 <- cor(forcorr75$healthyrec[forcorr75$ifin == TRUE], 
                    forcorr75$imphealthyrec[forcorr75$ifin == TRUE], method = "spearman")
p.imp.wSRH75

## 125 missings
p.imp125wSRH <- imp125[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                         "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                         "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp125 <- p.imp125wSRH
p.imp125$healthyrec <- ifelse(p.imp125$ifin == "TRUE", NA, p.imp125$healthyrec)
p.imp125$ifin <- NULL
p.imputed125 <- mi(p.imp125, max.minutes = 30, seed = 4)
p.imputed125.list <- mi.completed(p.imputed125) # extracting the data frames
df <- ldply(p.imputed125.list, data.frame) # rbinding
p.means125 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr125 <- merge(p.imp125wSRH, p.means125, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH125 <- cor(forcorr125$healthyrec[forcorr125$ifin == TRUE], 
                     forcorr125$imphealthyrec[forcorr125$ifin == TRUE], method = "spearman")
p.imp.wSRH125

## 175 missings
p.imp175wSRH <- imp175[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                         "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                         "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp175 <- p.imp175wSRH
p.imp175$healthyrec <- ifelse(p.imp175$ifin == "TRUE", NA, p.imp175$healthyrec)
p.imp175$ifin <- NULL
p.imputed175 <- mi(p.imp175, max.minutes = 30, seed = 4)
p.imputed175.list <- mi.completed(p.imputed175) # extracting the data frames
df <- ldply(p.imputed175.list, data.frame) # rbinding
p.means175 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr175 <- merge(p.imp175wSRH, p.means175, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH175 <- cor(forcorr175$healthyrec[forcorr175$ifin == TRUE], 
                     forcorr175$imphealthyrec[forcorr175$ifin == TRUE], method = "spearman")
p.imp.wSRH175

## 225 missings
p.imp225wSRH <- imp225[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                         "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                         "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp225 <- p.imp225wSRH
p.imp225$healthyrec <- ifelse(p.imp225$ifin == "TRUE", NA, p.imp225$healthyrec)
p.imp225$ifin <- NULL
p.imputed225 <- mi(p.imp225, max.minutes = 30, seed = 4)
p.imputed225.list <- mi.completed(p.imputed225) # extracting the data frames
df <- ldply(p.imputed225.list, data.frame) # rbinding
p.means225 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr225 <- merge(p.imp225wSRH, p.means225, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH225 <- cor(forcorr225$healthyrec[forcorr225$ifin == TRUE], 
                     forcorr225$imphealthyrec[forcorr225$ifin == TRUE], method = "spearman")
p.imp.wSRH225

## 275 missings
p.imp275wSRH <- imp275[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                         "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                         "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp275 <- p.imp275wSRH
p.imp275$healthyrec <- ifelse(p.imp275$ifin == "TRUE", NA, p.imp275$healthyrec)
p.imp275$ifin <- NULL
p.imputed275 <- mi(p.imp275, max.minutes = 30, seed = 4)
p.imputed275.list <- mi.completed(p.imputed275) # extracting the data frames
df <- ldply(p.imputed275.list, data.frame) # rbinding
p.means275 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr275 <- merge(p.imp275wSRH, p.means275, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH275 <- cor(forcorr275$healthyrec[forcorr275$ifin == TRUE], 
                     forcorr275$imphealthyrec[forcorr275$ifin == TRUE], method = "spearman")
p.imp.wSRH275

## 325 missings
p.imp325wSRH <- imp325[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                         "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                         "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp325 <- p.imp325wSRH
p.imp325$healthyrec <- ifelse(p.imp325$ifin == "TRUE", NA, p.imp325$healthyrec)
p.imp325$ifin <- NULL
p.imputed325 <- mi(p.imp325, max.minutes = 30, seed = 4)
p.imputed325.list <- mi.completed(p.imputed325) # extracting the data frames
df <- ldply(p.imputed325.list, data.frame) # rbinding
p.means325 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr325 <- merge(p.imp325wSRH, p.means325, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH325 <- cor(forcorr325$healthyrec[forcorr325$ifin == TRUE], 
                     forcorr325$imphealthyrec[forcorr325$ifin == TRUE], method = "spearman")
p.imp.wSRH325

## 375 missings
p.imp375wSRH <- imp375[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                         "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                         "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp375 <- p.imp375wSRH
p.imp375$healthyrec <- ifelse(p.imp375$ifin == "TRUE", NA, p.imp375$healthyrec)
p.imp375$ifin <- NULL
p.imputed375 <- mi(p.imp375, max.minutes = 30, seed = 4)
p.imputed375.list <- mi.completed(p.imputed375) # extracting the data frames
df <- ldply(p.imputed375.list, data.frame) # rbinding
p.means375 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr375 <- merge(p.imp375wSRH, p.means375, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH375 <- cor(forcorr375$healthyrec[forcorr375$ifin == TRUE], 
                     forcorr375$imphealthyrec[forcorr375$ifin == TRUE], method = "spearman")
p.imp.wSRH375

## 425 missings
p.imp425wSRH <- imp425[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                         "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                         "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp425 <- p.imp425wSRH
p.imp425$healthyrec <- ifelse(p.imp425$ifin == "TRUE", NA, p.imp425$healthyrec)
p.imp425$ifin <- NULL
p.imputed425 <- mi(p.imp425, max.minutes = 30, seed = 4)
p.imputed425.list <- mi.completed(p.imputed425) # extracting the data frames
df <- ldply(p.imputed425.list, data.frame) # rbinding
p.means425 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr425 <- merge(p.imp425wSRH, p.means425, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.wSRH425 <- cor(forcorr425$healthyrec[forcorr425$ifin == TRUE], 
                     forcorr425$imphealthyrec[forcorr425$ifin == TRUE], method = "spearman")
p.imp.wSRH425

## Having a look at all the correlations:
data.frame(p.imp.wSRH25, p.imp.wSRH75, p.imp.wSRH125, p.imp.wSRH175, p.imp.wSRH225,
           p.imp.wSRH275, p.imp.wSRH325, p.imp.wSRH375, p.imp.wSRH425)

## Re-doing the imputation with all variables but EXCLUDING adolescent SRH! ##

## 25 missings
p.imp25woSRH <- imp25[c("childid", "healthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp25 <- p.imp25woSRH
p.imp25 <- p.imp25
p.imp25$healthyrec <- ifelse(p.imp25$ifin == "TRUE", NA, p.imp25$healthyrec)
p.imp25$ifin <- NULL
p.imputed25 <- mi(p.imp25, max.minutes = 30, seed = 4)
p.imputed25.list <- mi.completed(p.imputed25) # extracting the data frames
df <- ldply(p.imputed25.list, data.frame) # rbinding
p.means25 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr25 <- merge(p.imp25woSRH, p.means25, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH25 <- cor(forcorr25$healthyrec[forcorr25$ifin == TRUE], 
                     forcorr25$imphealthyrec[forcorr25$ifin == TRUE], method = "spearman")
p.imp.woSRH25

## 75 missings
p.imp75woSRH <- imp75[c("childid", "healthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp75 <- p.imp75woSRH
p.imp75 <- p.imp75
p.imp75$healthyrec <- ifelse(p.imp75$ifin == "TRUE", NA, p.imp75$healthyrec)
p.imp75$ifin <- NULL
p.imputed75 <- mi(p.imp75, max.minutes = 30, seed = 4)
p.imputed75.list <- mi.completed(p.imputed75) # extracting the data frames
df <- ldply(p.imputed75.list, data.frame) # rbinding
p.means75 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr75 <- merge(p.imp75woSRH, p.means75, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH75 <- cor(forcorr75$healthyrec[forcorr75$ifin == TRUE], 
                     forcorr75$imphealthyrec[forcorr75$ifin == TRUE], method = "spearman")
p.imp.woSRH75

## 125 missings
p.imp125woSRH <- imp125[c("childid", "healthyrec", "female", "wi",
                          "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                          "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp125 <- p.imp125woSRH
p.imp125$healthyrec <- ifelse(p.imp125$ifin == "TRUE", NA, p.imp125$healthyrec)
p.imp125$ifin <- NULL
p.imputed125 <- mi(p.imp125, max.minutes = 40, seed = 4)
p.imputed125.list <- mi.completed(p.imputed125) # extracting the data frames
df <- ldply(p.imputed125.list, data.frame) # rbinding
p.means125 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr125 <- merge(p.imp125woSRH, p.means125, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH125 <- cor(forcorr125$healthyrec[forcorr125$ifin == TRUE], 
                      forcorr125$imphealthyrec[forcorr125$ifin == TRUE], method = "spearman")
p.imp.woSRH125

## 175 missings
p.imp175woSRH <- imp175[c("childid", "healthyrec", "female", "wi",
                          "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                          "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp175 <- p.imp175woSRH
p.imp175$healthyrec <- ifelse(p.imp175$ifin == "TRUE", NA, p.imp175$healthyrec)
p.imp175$ifin <- NULL
p.imputed175 <- mi(p.imp175, max.minutes = 40, seed = 4)
p.imputed175.list <- mi.completed(p.imputed175) # extracting the data frames
df <- ldply(p.imputed175.list, data.frame) # rbinding
p.means175 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr175 <- merge(p.imp175woSRH, p.means175, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH175 <- cor(forcorr175$healthyrec[forcorr175$ifin == TRUE], 
                      forcorr175$imphealthyrec[forcorr175$ifin == TRUE], method = "spearman")
p.imp.woSRH175

## 225 missings
p.imp225woSRH <- imp225[c("childid", "healthyrec", "female", "wi",
                          "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                          "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp225 <- p.imp225woSRH
p.imp225$healthyrec <- ifelse(p.imp225$ifin == "TRUE", NA, p.imp225$healthyrec)
p.imp225$ifin <- NULL
p.imputed225 <- mi(p.imp225, max.minutes = 40, seed = 4)
p.imputed225.list <- mi.completed(p.imputed225) # extracting the data frames
df <- ldply(p.imputed225.list, data.frame) # rbinding
p.means225 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr225 <- merge(p.imp225woSRH, p.means225, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH225 <- cor(forcorr225$healthyrec[forcorr225$ifin == TRUE], 
                      forcorr225$imphealthyrec[forcorr225$ifin == TRUE], method = "spearman")
p.imp.woSRH225

## 275 missings
p.imp275woSRH <- imp275[c("childid", "healthyrec", "female", "wi",
                          "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                          "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp275 <- p.imp275woSRH
p.imp275$healthyrec <- ifelse(p.imp275$ifin == "TRUE", NA, p.imp275$healthyrec)
p.imp275$ifin <- NULL
p.imputed275 <- mi(p.imp275, max.minutes = 40, seed = 4)
p.imputed275.list <- mi.completed(p.imputed275) # extracting the data frames
df <- ldply(p.imputed275.list, data.frame) # rbinding
p.means275 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr275 <- merge(p.imp275woSRH, p.means275, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH275 <- cor(forcorr275$healthyrec[forcorr275$ifin == TRUE], 
                      forcorr275$imphealthyrec[forcorr275$ifin == TRUE], method = "spearman")
p.imp.woSRH275

## 325 missings
p.imp325woSRH <- imp325[c("childid", "healthyrec", "female", "wi",
                          "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                          "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp325 <- p.imp325woSRH
p.imp325$healthyrec <- ifelse(p.imp325$ifin == "TRUE", NA, p.imp325$healthyrec)
p.imp325$ifin <- NULL
p.imputed325 <- mi(p.imp325, max.minutes = 40, seed = 4)
p.imputed325.list <- mi.completed(p.imputed325) # extracting the data frames
df <- ldply(p.imputed325.list, data.frame) # rbinding
p.means325 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr325 <- merge(p.imp325woSRH, p.means325, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH325 <- cor(forcorr325$healthyrec[forcorr325$ifin == TRUE], 
                      forcorr325$imphealthyrec[forcorr325$ifin == TRUE], method = "spearman")
p.imp.woSRH325

## 375 missings
p.imp375woSRH <- imp375[c("childid", "healthyrec", "female", "wi",
                          "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                          "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp375 <- p.imp375woSRH
p.imp375$healthyrec <- ifelse(p.imp375$ifin == "TRUE", NA, p.imp375$healthyrec)
p.imp375$ifin <- NULL
p.imputed375 <- mi(p.imp375, max.minutes = 40, seed = 4)
p.imputed375.list <- mi.completed(p.imputed375) # extracting the data frames
df <- ldply(p.imputed375.list, data.frame) # rbinding
p.means375 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr375 <- merge(p.imp375woSRH, p.means375, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH375 <- cor(forcorr375$healthyrec[forcorr375$ifin == TRUE], 
                      forcorr375$imphealthyrec[forcorr375$ifin == TRUE], method = "spearman")
p.imp.woSRH375

## 425 missings
p.imp425woSRH <- imp425[c("childid", "healthyrec", "female", "wi",
                          "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                          "puberty", "foodshrt", "anydis", "pschool", "aschool")]
p.imp425 <- p.imp425woSRH
p.imp425$healthyrec <- ifelse(p.imp425$ifin == "TRUE", NA, p.imp425$healthyrec)
p.imp425$ifin <- NULL
p.imputed425 <- mi(p.imp425, max.minutes = 40, seed = 4)
p.imputed425.list <- mi.completed(p.imputed425) # extracting the data frames
df <- ldply(p.imputed425.list, data.frame) # rbinding
p.means425 <- ddply(df, .(childid), summarize, imphealthyrec = mean(healthyrec)) # ave across mi data
forcorr425 <- merge(p.imp425woSRH, p.means425, by = "childid") # merging with the original data
# cor between actual data and imputation:
p.imp.woSRH425 <- cor(forcorr425$healthyrec[forcorr425$ifin == TRUE], 
                      forcorr425$imphealthyrec[forcorr425$ifin == TRUE], method = "spearman")
p.imp.woSRH425

## Having a look at all the correlations:
data.frame(p.imp.woSRH25, p.imp.woSRH75, p.imp.woSRH125, p.imp.woSRH175, p.imp.woSRH225,
           p.imp.woSRH275, p.imp.woSRH325, p.imp.woSRH375, p.imp.woSRH425)

## creating bar graph to easily compare correlations between actual and imputed 
## data for parent report of the adolescent's health

#### FIGURE 5 ####

# first constructing the data from from the correlations computed above...
cors <- c(p.imp.wSRH25, p.imp.wSRH75, p.imp.wSRH125, p.imp.wSRH175, p.imp.wSRH225,
          p.imp.wSRH275, p.imp.wSRH325, p.imp.wSRH375, p.imp.wSRH425,
          p.imp.woSRH25, p.imp.woSRH75, p.imp.woSRH125, p.imp.woSRH175, p.imp.woSRH225,
          p.imp.woSRH275, p.imp.woSRH325, p.imp.woSRH375, p.imp.woSRH425)
missings <- rep(c(25, 75, 125, 175, 225, 275, 325, 375, 425), times = 2)
Type <- c(rep(c("imputed with SRH"), times = 9), 
          rep(c("imputed without SRH"), times = 9))

mi.graph.parent <- data.frame(Type, missings, cors)

ggplot(data = mi.graph.parent, 
       aes(x = missings, y = cors, fill = Type)) + 
  geom_bar(stat = "identity", aes(fill = Type), position = position_dodge(), width = 25) + 
  scale_fill_manual(values = c("black","grey"), 
                    labels = c("imputed with adolescent SRH", "imputed without adolescent SRH")) +
  theme_bw() +
  labs(x = "% of observations set randomly to missing") +
  scale_x_continuous(breaks = c(25, 75, 125, 175, 225, 275, 325, 375, 425),
                     labels = c("0.07", "2.2", "3.5", "4.9", "6.4", "7.8", "9.2", "10.6", "12.0")) +
  ylab("Correlation") +
  theme(axis.text.x = element_text(size = 30),
        axis.title.x = element_text(face = "bold", size = 30),
        axis.text.y = element_text(size = 30),
        axis.title.y = element_text(face = "bold", size = 30),
        plot.title = element_text(size = 40), 
        legend.title = element_text(size = 30),
        legend.text = element_text(size = 30),
        legend.position = "top") 



## Imputing adolescent SRH with and without parent report

## First WITH parent report of the adolescent's health

## 25 missings
a.imp25wp <- imp25[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                     "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                     "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp25 <- a.imp25wp
a.imp25$chealthyrec <- ifelse(a.imp25$ifin == "TRUE", NA, a.imp25$chealthyrec)
a.imp25$ifin <- NULL
a.imputed25 <- mi(a.imp25, max.minutes = 30, seed = 4)
a.imputed25.list <- mi.completed(a.imputed25) # extracting the data frames
df <- ldply(a.imputed25.list, data.frame) # rbinding
a.means25 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr25 <- merge(a.imp25wp, a.means25, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp25 <- cor(forcorr25$chealthyrec[forcorr25$ifin == TRUE], 
                  forcorr25$impchealthyrec[forcorr25$ifin == TRUE], method = "spearman")
a.imp.wp25

## 75 missings
a.imp75wp <- imp75[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                     "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                     "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp75 <- a.imp75wp
a.imp75$chealthyrec <- ifelse(a.imp75$ifin == "TRUE", NA, a.imp75$chealthyrec)
a.imp75$ifin <- NULL
a.imputed75 <- mi(a.imp75, max.minutes = 30, seed = 4)
a.imputed75.list <- mi.completed(a.imputed75) # extracting the data frames
df <- ldply(a.imputed75.list, data.frame) # rbinding
a.means75 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr75 <- merge(a.imp75wp, a.means75, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp75 <- cor(forcorr75$chealthyrec[forcorr75$ifin == TRUE], 
                  forcorr75$impchealthyrec[forcorr75$ifin == TRUE], method = "spearman")
a.imp.wp75

## 125 missings
a.imp125wp <- imp125[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp125 <- a.imp125wp
a.imp125$chealthyrec <- ifelse(a.imp125$ifin == "TRUE", NA, a.imp125$chealthyrec)
a.imp125$ifin <- NULL
a.imputed125 <- mi(a.imp125, max.minutes = 30, seed = 4)
a.imputed125.list <- mi.completed(a.imputed125) # extracting the data frames
df <- ldply(a.imputed125.list, data.frame) # rbinding
a.means125 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr125 <- merge(a.imp125wp, a.means125, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp125 <- cor(forcorr125$chealthyrec[forcorr125$ifin == TRUE], 
                   forcorr125$impchealthyrec[forcorr125$ifin == TRUE], method = "spearman")
a.imp.wp125

## 175 missings
a.imp175wp <- imp175[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp175 <- a.imp175wp
a.imp175$chealthyrec <- ifelse(a.imp175$ifin == "TRUE", NA, a.imp175$chealthyrec)
a.imp175$ifin <- NULL
a.imputed175 <- mi(a.imp175, max.minutes = 30, seed = 4)
a.imputed175.list <- mi.completed(a.imputed175) # extracting the data frames
df <- ldply(a.imputed175.list, data.frame) # rbinding
a.means175 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr175 <- merge(a.imp175wp, a.means175, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp175 <- cor(forcorr175$chealthyrec[forcorr175$ifin == TRUE], 
                   forcorr175$impchealthyrec[forcorr175$ifin == TRUE], method = "spearman")
a.imp.wp175

## 225 missings
a.imp225wp <- imp225[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp225 <- a.imp225wp
a.imp225$chealthyrec <- ifelse(a.imp225$ifin == "TRUE", NA, a.imp225$chealthyrec)
a.imp225$ifin <- NULL
a.imputed225 <- mi(a.imp225, max.minutes = 30, seed = 4)
a.imputed225.list <- mi.completed(a.imputed225) # extracting the data frames
df <- ldply(a.imputed225.list, data.frame) # rbinding
a.means225 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr225 <- merge(a.imp225wp, a.means225, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp225 <- cor(forcorr225$chealthyrec[forcorr225$ifin == TRUE], 
                   forcorr225$impchealthyrec[forcorr225$ifin == TRUE], method = "spearman")
a.imp.wp225

## 275 missings
a.imp275wp <- imp275[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp275 <- a.imp275wp
a.imp275$chealthyrec <- ifelse(a.imp275$ifin == "TRUE", NA, a.imp275$chealthyrec)
a.imp275$ifin <- NULL
a.imputed275 <- mi(a.imp275, max.minutes = 30, seed = 4)
a.imputed275.list <- mi.completed(a.imputed275) # extracting the data frames
df <- ldply(a.imputed275.list, data.frame) # rbinding
a.means275 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr275 <- merge(a.imp275wp, a.means275, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp275 <- cor(forcorr275$chealthyrec[forcorr275$ifin == TRUE], 
                   forcorr275$impchealthyrec[forcorr275$ifin == TRUE], method = "spearman")
a.imp.wp275

## 325 missings
a.imp325wp <- imp325[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp325 <- a.imp325wp
a.imp325$chealthyrec <- ifelse(a.imp325$ifin == "TRUE", NA, a.imp325$chealthyrec)
a.imp325$ifin <- NULL
a.imputed325 <- mi(a.imp325, max.minutes = 30, seed = 4)
a.imputed325.list <- mi.completed(a.imputed325) # extracting the data frames
df <- ldply(a.imputed325.list, data.frame) # rbinding
a.means325 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr325 <- merge(a.imp325wp, a.means325, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp325 <- cor(forcorr325$chealthyrec[forcorr325$ifin == TRUE], 
                   forcorr325$impchealthyrec[forcorr325$ifin == TRUE], method = "spearman")
a.imp.wp325

## 375 missings
a.imp375wp <- imp375[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp375 <- a.imp375wp
a.imp375$chealthyrec <- ifelse(a.imp375$ifin == "TRUE", NA, a.imp375$chealthyrec)
a.imp375$ifin <- NULL
a.imputed375 <- mi(a.imp375, max.minutes = 30, seed = 4)
a.imputed375.list <- mi.completed(a.imputed375) # extracting the data frames
df <- ldply(a.imputed375.list, data.frame) # rbinding
a.means375 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr375 <- merge(a.imp375wp, a.means375, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp375 <- cor(forcorr375$chealthyrec[forcorr375$ifin == TRUE], 
                   forcorr375$impchealthyrec[forcorr375$ifin == TRUE], method = "spearman")
a.imp.wp375

## 425 missings
a.imp425wp <- imp425[c("childid", "healthyrec", "chealthyrec", "female", "wi",
                       "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                       "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp425 <- a.imp425wp
a.imp425$chealthyrec <- ifelse(a.imp425$ifin == "TRUE", NA, a.imp425$chealthyrec)
a.imp425$ifin <- NULL
a.imputed425 <- mi(a.imp425, max.minutes = 30, seed = 4)
a.imputed425.list <- mi.completed(a.imputed425) # extracting the data frames
df <- ldply(a.imputed425.list, data.frame) # rbinding
a.means425 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr425 <- merge(a.imp425wp, a.means425, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wp425 <- cor(forcorr425$chealthyrec[forcorr425$ifin == TRUE], 
                   forcorr425$impchealthyrec[forcorr425$ifin == TRUE], method = "spearman")
a.imp.wp425

## Having a look at all the correlations:
data.frame(a.imp.wp25, a.imp.wp75, a.imp.wp125, a.imp.wp175, a.imp.wp225, a.imp.wp275,
           a.imp.wp325, a.imp.wp375, a.imp.wp425)

## Re-doing the imputation with all variables but EXCLUDING parent SRH!

## 25 missings
a.imp25wop <- imp25[c("childid", "chealthyrec", "female", "wi",
                      "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                      "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp25 <- a.imp25wop
a.imp25$chealthyrec <- ifelse(a.imp25$ifin == "TRUE", NA, a.imp25$chealthyrec)
a.imp25$ifin <- NULL
a.imputed25 <- mi(a.imp25, max.minutes = 30, seed = 4)
a.imputed25.list <- mi.completed(a.imputed25) # extracting the data frames
df <- ldply(a.imputed25.list, data.frame) # rbinding
a.means25 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr25 <- merge(a.imp25wop, a.means25, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop25 <- cor(forcorr25$chealthyrec[forcorr25$ifin == TRUE], 
                   forcorr25$impchealthyrec[forcorr25$ifin == TRUE], method = "spearman")
a.imp.wop25

## 75 missings
a.imp75wop <- imp75[c("childid", "chealthyrec", "female", "wi",
                      "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                      "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp75 <- a.imp75wop
a.imp75$chealthyrec <- ifelse(a.imp75$ifin == "TRUE", NA, a.imp75$chealthyrec)
a.imp75$ifin <- NULL
a.imputed75 <- mi(a.imp75, max.minutes = 30, seed = 4)
a.imputed75.list <- mi.completed(a.imputed75) # extracting the data frames
df <- ldply(a.imputed75.list, data.frame) # rbinding
a.means75 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr75 <- merge(a.imp75wop, a.means75, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop75 <- cor(forcorr75$chealthyrec[forcorr75$ifin == TRUE], 
                   forcorr75$impchealthyrec[forcorr75$ifin == TRUE], method = "spearman")
a.imp.wop75

## 125 missings
a.imp125wop <- imp125[c("childid", "chealthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp125 <- a.imp125wop
a.imp125$chealthyrec <- ifelse(a.imp125$ifin == "TRUE", NA, a.imp125$chealthyrec)
a.imp125$ifin <- NULL
a.imputed125 <- mi(a.imp125, max.minutes = 40, seed = 4)
a.imputed125.list <- mi.completed(a.imputed125) # extracting the data frames
df <- ldply(a.imputed125.list, data.frame) # rbinding
a.means125 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr125 <- merge(a.imp125wop, a.means125, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop125 <- cor(forcorr125$chealthyrec[forcorr125$ifin == TRUE], 
                    forcorr125$impchealthyrec[forcorr125$ifin == TRUE], method = "spearman")
a.imp.wop125

## 175 missings
a.imp175wop <- imp175[c("childid", "chealthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp175 <- a.imp175wop
a.imp175$chealthyrec <- ifelse(a.imp175$ifin == "TRUE", NA, a.imp175$chealthyrec)
a.imp175$ifin <- NULL
a.imputed175 <- mi(a.imp175, max.minutes = 40, seed = 4)
a.imputed175.list <- mi.completed(a.imputed175) # extracting the data frames
df <- ldply(a.imputed175.list, data.frame) # rbinding
a.means175 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr175 <- merge(a.imp175wop, a.means175, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop175 <- cor(forcorr175$chealthyrec[forcorr175$ifin == TRUE], 
                    forcorr175$impchealthyrec[forcorr175$ifin == TRUE], method = "spearman")
a.imp.wop175

## 225 missings
a.imp225wop <- imp225[c("childid", "chealthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp225 <- a.imp225wop
a.imp225$chealthyrec <- ifelse(a.imp225$ifin == "TRUE", NA, a.imp225$chealthyrec)
a.imp225$ifin <- NULL
a.imputed225 <- mi(a.imp225, max.minutes = 40, seed = 4)
a.imputed225.list <- mi.completed(a.imputed225) # extracting the data frames
df <- ldply(a.imputed225.list, data.frame) # rbinding
a.means225 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr225 <- merge(a.imp225wop, a.means225, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop225 <- cor(forcorr225$chealthyrec[forcorr225$ifin == TRUE], 
                    forcorr225$impchealthyrec[forcorr225$ifin == TRUE], method = "spearman")
a.imp.wop225

## 275 missings
a.imp275wop <- imp275[c("childid", "chealthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp275 <- a.imp275wop
a.imp275$chealthyrec <- ifelse(a.imp275$ifin == "TRUE", NA, a.imp275$chealthyrec)
a.imp275$ifin <- NULL
a.imputed275 <- mi(a.imp275, max.minutes = 40, seed = 4)
a.imputed275.list <- mi.completed(a.imputed275) # extracting the data frames
df <- ldply(a.imputed275.list, data.frame) # rbinding
a.means275 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr275 <- merge(a.imp275wop, a.means275, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop275 <- cor(forcorr275$chealthyrec[forcorr275$ifin == TRUE], 
                    forcorr275$impchealthyrec[forcorr275$ifin == TRUE], method = "spearman")
a.imp.wop275

## 325 missings
a.imp325wop <- imp325[c("childid", "chealthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp325 <- a.imp325wop
a.imp325$chealthyrec <- ifelse(a.imp325$ifin == "TRUE", NA, a.imp325$chealthyrec)
a.imp325$ifin <- NULL
a.imputed325 <- mi(a.imp325, max.minutes = 40, seed = 4)
a.imputed325.list <- mi.completed(a.imputed325) # extracting the data frames
df <- ldply(a.imputed325.list, data.frame) # rbinding
a.means325 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr325 <- merge(a.imp325wop, a.means325, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop325 <- cor(forcorr325$chealthyrec[forcorr325$ifin == TRUE], 
                    forcorr325$impchealthyrec[forcorr325$ifin == TRUE], method = "spearman")
a.imp.wop325

## 375 missings
a.imp375wop <- imp375[c("childid", "chealthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp375 <- a.imp375wop
a.imp375$chealthyrec <- ifelse(a.imp375$ifin == "TRUE", NA, a.imp375$chealthyrec)
a.imp375$ifin <- NULL
a.imputed375 <- mi(a.imp375, max.minutes = 40, seed = 4)
a.imputed375.list <- mi.completed(a.imputed375) # extracting the data frames
df <- ldply(a.imputed375.list, data.frame) # rbinding
a.means375 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr375 <- merge(a.imp375wop, a.means375, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop375 <- cor(forcorr375$chealthyrec[forcorr375$ifin == TRUE], 
                    forcorr375$impchealthyrec[forcorr375$ifin == TRUE], method = "spearman")
a.imp.wop375

## 425 missings
a.imp425wop <- imp425[c("childid", "chealthyrec", "female", "wi",
                        "countrynum", "ifin", "mightdie", "longterm", "stunted", 
                        "puberty", "foodshrt", "anydis", "pschool", "aschool")]
a.imp425 <- a.imp425wop
a.imp425$chealthyrec <- ifelse(a.imp425$ifin == "TRUE", NA, a.imp425$chealthyrec)
a.imp425$ifin <- NULL
a.imputed425 <- mi(a.imp425, max.minutes = 40, seed = 4)
a.imputed425.list <- mi.completed(a.imputed425) # extracting the data frames
df <- ldply(a.imputed425.list, data.frame) # rbinding
a.means425 <- ddply(df, .(childid), summarize, impchealthyrec = mean(chealthyrec)) # ave across mi data
forcorr425 <- merge(a.imp425wop, a.means425, by = "childid") # merging with the original data
# cor between actual data and imputation:
a.imp.wop425 <- cor(forcorr425$chealthyrec[forcorr425$ifin == TRUE], 
                    forcorr425$impchealthyrec[forcorr425$ifin == TRUE], method = "spearman")
a.imp.wop425

## Having a look at all the correlations:
data.frame(a.imp.wop25, a.imp.wop75, a.imp.wop125, a.imp.wop175, a.imp.wop225,
           a.imp.wop275, a.imp.wop325, a.imp.wop375, a.imp.wop425)

## creating bar graph to easily compare correlations between actual and imputed 
## data for parent report of the adolescent's health

#### FIGURE 6 ####

# first constructing the data frome from the correlations computed above...
cors <- c(a.imp.wp25, a.imp.wp75, a.imp.wp125, a.imp.wp175, a.imp.wp225, a.imp.wp275,
          a.imp.wp325, a.imp.wp375, a.imp.wp425,
          a.imp.wop25, a.imp.wop75, a.imp.wop125, a.imp.wop175, a.imp.wop225,
          a.imp.wop275, a.imp.wop325, a.imp.wop375, a.imp.wop425)
missings <- rep(c(25, 75, 125, 175, 225, 275, 325, 375, 425), times = 2)
Type <- c(rep(c("imputed with parent report"), times = 9), 
          rep(c("imputed without parent report"), times = 9))

mi.graph.SRH <- data.frame(Type, missings, cors)

ggplot(data = mi.graph.SRH, 
       aes(x = missings, y = cors, fill = Type)) + 
  geom_bar(stat = "identity", aes(fill = Type), position=position_dodge(), width = 25) + 
  scale_fill_manual(values = c("black","grey"), 
                    labels = c("imputed with parent report", "imputed without parent report")) +
  theme_bw() +
  labs(x = "% of observations set randomly to missing") +
  scale_x_continuous(breaks = c(25, 75, 125, 175, 225, 275, 325, 375, 425),
                     labels = c("0.07", "2.2", "3.5", "4.9", "6.4", "7.8", "9.2", "10.6", "12.0")) +
  ylab("Correlation") +
  theme(axis.text.x = element_text(size = 30),
        axis.title.x = element_text(face = "bold", size = 30),
        axis.text.y = element_text(size = 30),
        axis.title.y = element_text(face = "bold", size = 30),
        plot.title = element_text(size = 40), 
        legend.title = element_text(size = 30),
        legend.text = element_text(size = 30),
        legend.position = "top")

