###############################################################
# TITLE:  ANALYSIS REPLICATION CODE                           #
# FOR:    Manuscript titled "Measuring Network Sizes in the   #
#         Context of Respondent Driven Sampling: Evidence     #
#         from Two Independent Surveys" accepted by           #
#         Survey Research Methods                             #
# BY:     Sunghee Lee (sungheel@umich.edu)                    #
# DATE:   October 6, 2024                                     #
###############################################################


###############################################################
# REQUIRED LIBRARIES                                          #
###############################################################

library(haven)
library(tidyverse)
library(doBy)
library(RDS)
library(ggplot2)
library(ggpubr)
library(gridExtra)
library(grid)
library(survey)
library(dplyr)
library(janitor)
library(scales)

###############################################################
# USER DEFINED FUNCTIONS                                      #
###############################################################

#Standard error
se<-function(x){
  sd(x,na.rm=T)/sqrt(length(x)-sum(is.na(x)))
  }
  
# Finding Recruiter ID in HLSK
#  (See ANALYSIS STEP 4 section about its use)
Find_Rec_ID = function(data){
  id1 = match(data$UNIQUE_NUM, data$COUPON1_NUM, nomatch = 0) 
  id2 = match(data$UNIQUE_NUM, data$COUPON2_NUM, nomatch = 0) 
  id = `+`(id1, id2)
  id[id == 0] = NA
  return(data$UNIQUE_NUM[id])
}

###############################################################
# DATA RETRIEVAL                                              #
###############################################################
#
# 1. Project Positive Attitudes Towards Health (PATH)
#   Data retrievable from https://www.icpsr.umich.edu/web/ICPSR/studies/37957
#   This manuscript uses both unrestricted and restricted data. 
#   The restricted data requires an application.
#
#   List of unrestricted variables used in the analyses:
#   M_IWCOMPLETE, M_AQ3_WHITE, M_AQ3_BLACK, M_NETSIZE, M_AQ1, IS_SC1
#
#   List of restricted variables used in the analyses:
#   M_AQ2, M_AQ3_AIAN, M_AQ3_ASIAN, M_AQ3_MENA, M_AQ3_NHPI, 
#   M_AQ3_OTHER, M_SNQ4, M_SNQ1_1, M_SNQ1_2, M_NUMBER_PAT, M_NUMBER_INT, 
#   M_NUMBER_CLS, LOCATION, M_HQ7, M_AQ6, M_HQ1, M_HQ3, M_HQ5, WAVE_NUM, 
#   SEED, RECRUITERID, COUPON3_NUM, COUPON1_NUM, COUPON2_NUM, CPN_NUM
#
# 2. Health and Life Study of Koreans (HLSK) 
#   Data retrievable from https://www.icpsr.umich.edu/web/ICPSR/studies/37635/
#   This manuscript uses both unrestricted and restricted data. 
#   The restricted data requires an application.
#
#   List of unrestricted variables used in the analyses:
#   MAIN_COMP, CPN_NUM, AQ7, AQ8, AQ8_INDICATOR, NUMKR, CPN_NUM, AQ1_1, 
#   BQ2, GQ9, LQ1, GQ6, INTVLANG, SITE
#
#   List of restricted variables used in the analyses:
#   ZQ_EXP_APPROACH, ZQ_EXP_FAMILY, ZQ5_1, ZQ5_2, ZQ5_3, ZQ5_4, AQ1, LQ3, 
#   UNIQUE_NUM, WAVE_NUM, COUPON1_NUM, COUPON2_NUM, SEED
# 
# 3. American Community Survey (ACS) 2012-2016 Public Use Microdata Sample (PUMS)
#   To access ACS 2012-2016 PUMS, please visit 
#   https://www2.census.gov/programs-surveys/acs/data/pums/2016/5-Year/ 
#   Download csv_hca.zip (CA household), csv_pca.zip (CA person), 
#            csv_hmi.zip (MI household), csv_pmi.zip (MI person)
#   
#   List of variables used: 
#   From household files: SERIALNO, HINCP 
#   From person files: SERIALNO, SPORDER, PUMA, ANC1P, ANC2P, 
#                      RAC2P, RAC3P, CIT, AGEP, SEX, SCHL, MAR, ESR
#                      PWGTP, PWGTP1:PWGTP80
# 
###############################################################


###############################################################
# DATA LOADING                                                #
###############################################################
#
# The loaded data sets are named as follows in the code: 
#   PATH 
#   HLSK  
#   ACS_CA_H, ACS_CA_P, ACS_MI_H, ACS_MI_P
#   --> ACS household and person data sets are merged separately 
#       for each site (LA and MI) 
#       and stacked across both sites; this data set is named ACS
#
###############################################################

# PATH Data

PATH<-read.table("YOUR FILE LOCATION AND YOUR FILE")%>%
  filter(!is.na(M_IWCOMPLETE))

# HLSK Data

HLSK<-read.table("YOUR FILE LOCATION AND YOUR FILE")%>%
  filter(MAIN_COMP==1)

# ACS CA Data

ACS_CA_H<-read.table("YOUR FILE LOCATION AND YOUR FILE")%>%
  select(SERIALNO, HINCP)

ACS_CA_P<-read.table("YOUR FILE LOCATION AND YOUR FILE")%>%
  select(SERIALNO, SPORDER, PUMA, ANC1P, ANC2P, 
         RAC2P, RAC3P, CIT, AGEP, SEX, SCHL, MAR, ESR, 
         PWGTP, PWGTP1:PWGTP80)

#ACS_LA_KR includes only those are in LA and are non-US born Koreans
ACS_LA_KR<-subset (ACS_CA_P, ((PUMA>=3701 & PUMA<=3769)) & 
                     (ANC1P==750|ANC2P==750|
                        RAC2P==49|RAC3P==8|RAC3P==22|
                        RAC3P==35|RAC3P==46|
                        RAC3P==54|RAC3P==98) &
                     CIT!=1 & AGEP>=18) 
#Merge with household data by SERIALNO
ACS_LA<-merge(ACS_LA_KR, ACS_CA_H, by="SERIALNO")

ACS_LA$Location<-"LA" 

# ACS MI Data

ACS_MI_H<-read.table("YOUR FILE LOCATION AND YOUR FILE")%>%
  select(SERIALNO, HINCP)

ACS_MI_P<-read.table("YOUR FILE LOCATION AND YOUR FILE")%>%
  select(SERIALNO, SPORDER, PUMA, ANC1P, ANC2P, 
         RAC2P, RAC3P, CIT, AGEP, SEX, SCHL, MAR, ESR, 
         PWGTP, PWGTP1:PWGTP80)

#ACS_MI_KR includes only those are non-US born Koreans in MI
ACS_MI_KR<-subset(ACS_MI_P, 
                  (ANC1P==750|ANC2P==750|
                     RAC2P==49|RAC3P==8|RAC3P==22|
                     RAC3P==35|RAC3P==46|
                     RAC3P==54|RAC3P==98) &
                    CIT!=1 & AGEP>=18) 
 
ACS_MI<-merge(ACS_MI_KR, ACS_MI_H, by="SERIALNO")

ACS_MI$Location<-"MI" 

#Stacking ACS LA and MI Data
ACS<-rbind(ACS_LA, ACS_MI)

rm(ACS_CA_P,ACS_CA_H,ACS_LA_KR,ACS_LA,
   ACS_MI_P,ACS_MI_KR,ACS_MI_H,ACS_MI)


###############################################################
# VARIABLE RECODING                                           #
###############################################################

# 1. PATH

# Participant characteristics variables (Table 1)

PATH<-PATH%>%mutate(
  Agecat=case_when( 
    is.na(IS_SC1)~"NA", 
    IS_SC1<30~"18-29",
    IS_SC1<40~"30s",
    IS_SC1<50~"40s",
    IS_SC1<60~"50s",
    TRUE~"60+"),
  
  Agecat3=case_when( 
    Agecat=="18-29"| Agecat=="30s" ~ "<40",
    Agecat=="40s" | Agecat=="50s" ~ "<60s",
    TRUE~"61+"),
  
  Male=ifelse(M_AQ1==2,0,M_AQ1), 

  Employed=case_when(
    is.na(M_HQ1)~NaN,  
    M_HQ1<3~1,
    TRUE~0),
  
  IncomeLE20=case_when(
    is.na(M_HQ3)~NaN,  
    M_HQ3<=20000~1,
    TRUE~0),
   
  IncomeLE20=ifelse(!is.na(IncomeLE20),IncomeLE20,ifelse(
    is.na(M_HQ5),NA,ifelse(
      M_HQ5%in% c(2,3),1,0))),
  
  
  NHWhite=case_when(
    M_AQ2==0         & M_AQ3_WHITE==1  
    & M_AQ3_AIAN!=1  & M_AQ3_ASIAN!=1  
    & M_AQ3_BLACK!=1 & M_AQ3_MENA!=1   
    & M_AQ3_NHPI!=1  & M_AQ3_OTHER!=1 ~ 1, 
    is.na(M_AQ2)~NaN, 
    TRUE~0),
   
  LivingAlone=case_when(
    is.na(M_HQ7)~NaN, 
    M_HQ7==1~1,
    TRUE~0),

  Location2=case_when( 
    LOCATION==1~"Det",  
    TRUE~"Non-Det"),
  
  Educ=case_when(
  M_AQ6<4~"LT HS", 
  M_AQ6<6~"HS",
  M_AQ6<=11~"MT HS",
  TRUE~"NA"))

# Degree variables

PATH<-PATH%>%mutate(
  
  D_TOTAL_STD=M_NETSIZE, 
  
  D_TOTAL_PRIME=M_SNQ4,
  
  D_TOTAL_STD_RTYPE=case_when(
    is.na(D_TOTAL_STD)~"1.NA",
    D_TOTAL_STD==0~"2.0",
    TRUE~"3.Non0"),                  
  
  D_TOTAL_PRIME_RTYPE=case_when(
    is.na(D_TOTAL_PRIME)~"1.NA",
    D_TOTAL_PRIME==0~"2.0",
    TRUE~"3.Non0"),
  
  D_TOTAL_COMP=case_when(
    is.na(D_TOTAL_STD) |  is.na(D_TOTAL_PRIME)~'NA',
    D_TOTAL_STD>D_TOTAL_PRIME~"1.STD>PRIME",
    D_TOTAL_STD==D_TOTAL_PRIME~"2.STD=PRIME",
    TRUE~"3.STD<PRIME"),
  
  D_PAT=M_NUMBER_PAT,
  
  D_TOTAL_PAT=case_when(
    is.na(M_NUMBER_PAT)~NaN,
    TRUE~round(PATH$M_NUMBER_PAT*(100/((1.04+1.2)/2)),0)), 
  
  D_INTERACT=M_NUMBER_INT,
  
  D_CLOSE=M_NUMBER_CLS)

# Log transformed degrees variables

PATH<-PATH%>%
  select(D_TOTAL_STD,
         D_TOTAL_PRIME,
         D_TOTAL_PAT,
         D_PAT,
         D_INTERACT,
         D_CLOSE)%>%
  mutate_all(.,~log(.+1))%>%
  rename_all(.,~paste0("LOG_",.))%>%
  bind_cols(PATH)

#Recruitment success

PATH<-PATH%>%mutate(
  RECRUIT_NUM=CPN_USED_NUM, 

  RECRUIT=case_when(
    is.na(CPN_USED_NUM)~NaN,
    CPN_USED_NUM==0~0,
    TRUE~1),
  
  SUCC=CPN_USED_NUM,
  
  FAIL=CPN_NUM-CPN_USED_NUM)



# 2. HLSK

# Participant characteristics variables (Table 1)

HLSK<-HLSK%>% mutate(
  Agecat=ifelse(AQ1<30,"18-29", ifelse(
    AQ1<40,"30s", ifelse(
      AQ1<50,"40s", ifelse(
        AQ1<60,"50s", ifelse(
          AQ1>=60|AQ1_1==7,"60+",NA))))),
  
  Age18_29=ifelse(AQ1<30&AQ1>17,1,0),
  
  Male=BQ1_MALE,
  
  Employed=ifelse(is.na(LQ1),NA, ifelse(
    LQ1==1,1,0)),
    
  IncomeMT50=ifelse(is.na(LQ3),NA, ifelse(
    LQ3>50000,1,0)),
    
  Identity=ifelse(GQ6==1,"Korean", ifelse(
    GQ6==3,"Korean-Am","Other")),
    
  Married=ifelse(is.na(BQ2),NA, ifelse(
    BQ2==1,1,0)),
    
  Site=SITE,
    
  Educ=ifelse(is.na(GQ9),NA, ifelse(
    GQ9<9,"LT BA/S", ifelse(
      GQ9==9,"BA/S","MT BA/S"))),
    
  Edu_COLL=ifelse(GQ9==9,1,0),
  
  Language=INTVLANG )

# Degree variables

HLSK<-HLSK%>% mutate(
  
  D_TOTAL= NUMKR,
  
  D_TOTAL_GT100=ifelse(is.na(D_TOTAL),NA,ifelse(
    D_TOTAL>100,1,0)),
  
  D_TOTAL_GT500=ifelse(is.na(D_TOTAL),NA,ifelse(
    D_TOTAL>500,1,0)),
  
  D_TOTAL_WOOUT=ifelse(is.na(D_TOTAL),NA, ifelse(
    D_TOTAL>=round(quantile(D_TOTAL, probs = .98, na.rm=T),0),
    round(quantile(D_TOTAL, probs = .98, na.rm=T),0), 
    D_TOTAL)),
  
  D_TOTAL_RESPONSE_TYPE=ifelse(is.na(ZQ_EXP_APPROACH),NA, ifelse(
    is.na(D_TOTAL),"1.NR", ifelse(
      D_TOTAL==0, "2.0","3.Non0"))),
  
  D_OH=ZQ5_1, 
  
  D_TOTAL_OH=ifelse(is.na(D_OH), NA, round(D_OH*(100/1.54),0)),
  
  D_OH_WOOUT=ifelse(is.na(D_OH),NA,
                    ifelse(D_OH>=round(quantile(D_OH, probs = .98, na.rm=T),0),
                           round(quantile(D_OH, probs = .98, na.rm=T),0), 
                           D_OH)),
  
  D_TOTAL_OH_WOOUT=ifelse(is.na(D_OH_WOOUT), NA, 
                          round(D_OH_WOOUT/(1.54/100),0)),
  
  D_FAM=ZQ5_2,  

  D_FAM_WOOUT=ifelse(is.na(D_FAM),NA,
                     ifelse(D_FAM>=round(quantile(D_FAM, probs = .98, na.rm=T),0),
                            round(quantile(D_FAM, probs = .98, na.rm=T),0), 
                            D_FAM)),
  
  D_INTERACT=ZQ5_3, 

  D_INTERACT_WOOUT=ifelse(is.na(D_INTERACT),NA,
                          ifelse(D_INTERACT>=round(quantile(D_INTERACT, probs = .98, na.rm=T),0),
                                 round(quantile(D_INTERACT, probs = .98, na.rm=T),0), 
                                 D_INTERACT)),
  
  D_CLOSE=ZQ5_4, 
  
  D_CLOSE_WOOUT=ifelse(is.na(D_CLOSE),NA,
                       ifelse(D_CLOSE>=round(quantile(D_CLOSE, probs = .98, na.rm=T),0),
                              round(quantile(D_CLOSE, probs = .98, na.rm=T),0), 
                              D_CLOSE)))


# Log transformed degrees variables

HLSK<-HLSK%>%
  select(D_TOTAL,D_TOTAL_WOOUT,
         D_OH,D_OH_WOOUT,
         D_TOTAL_OH,D_TOTAL_OH_WOOUT,
         D_FAM, D_FAM_WOOUT,
         D_INTERACT, D_INTERACT_WOOUT,
         D_CLOSE, D_CLOSE_WOOUT)%>%
  mutate_all(.,~log(.+1))%>%
  rename_all(.,~paste0(.,"_LOG"))%>%
  bind_cols(HLSK)


# Experimental variables

HLSK<-HLSK%>% mutate(
  
  D_TOTAL_EXP_APPROACH=ifelse(is.na(ZQ_EXP_APPROACH),NA, ifelse(
    ZQ_EXP_APPROACH==1, "Sequential", ifelse(
      ZQ_EXP_APPROACH==2, "Direct", "Direct, Coupon"))),
  
  QuestionApproach=ifelse(D_TOTAL_EXP_APPROACH=="","",ifelse(
    D_TOTAL_EXP_APPROACH=="Direct", "a.Standard (n=211)", ifelse(
      D_TOTAL_EXP_APPROACH=="Direct, Coupon", "b.RecruitPrime (n=219)",
        "c.Sequential (n=207)"))), 
      
  INTVLANG_BILINGUAL=ifelse(AQ7!=3, NA, ifelse(
    AQ7==3 & INTVLANG==1,"ENG","KOR")),
    
  InterviewLanguage=ifelse(is.na(INTVLANG_BILINGUAL), NA,ifelse(
    INTVLANG_BILINGUAL=="ENG","a.English (n=105)", "b.Korean (n=129)"))
    )

# Recruitment success

HLSK<-HLSK%>%mutate(
  RECRUIT_NUM=CPN_USED_NUM,
  
  RECRUIT=ifelse(is.na(CPN_USED_NUM),NA,
                 ifelse(CPN_USED_NUM==0,0,1)),
  
  SUCC=CPN_USED_NUM, 
  
  FAIL=CPN_NUM-CPN_USED_NUM)

 
  
# 3. ACS 

ACS<-ACS%>%mutate(
  # Age: 18-29 yrs
  Age18_29=ifelse(is.na(AGEP),NA,
                  ifelse(AGEP<30,1,0)),
  
  # Sex: Male
  Male=ifelse(is.na(SEX),NA,
              ifelse(SEX==1,1,0)),
  
  # Marital status: Married
  Married=ifelse(is.na(MAR),NA,
                 ifelse(MAR==1,1,0)),
  
  # Education: =4yr college degree 
  Educ_COLL=ifelse(is.na(SCHL),NA,
                   ifelse(SCHL==21,1,0)),
  # Income>$50K
  IncomeMT50=ifelse(is.na(HINCP), NA,
                    ifelse(HINCP<=50000,0,1)),
  
  # Employed last week
  Employed=ifelse(is.na(ESR),NA,
                  ifelse(ESR==3|ESR==6,0,1)))



###############################################################
# PARTICIPANT CHARACTERISTICS (TABLE 1)                       #
###############################################################

dim(PATH)
table(PATH$Agecat3)
table(PATH$Male)
table(PATH$Employed)
table(PATH$IncomeLE20)
table(PATH$NHWhite)
table(PATH$LivingAlone)
table(PATH$Location2)
table(PATH$Educ)

dim(HLSK)
table(HLSK$Agecat)
table(HLSK$Male)
table(HLSK$Employed)
table(HLSK$IncomeMT50)
table(HLSK$Identity)
table(HLSK$Married)
table(HLSK$Site)
table(HLSK$Educ)
table(HLSK$Language)

table(HLSK$Agecat, HLSK$Site)
table(HLSK$Male, HLSK$Site)
table(HLSK$Employed, HLSK$Site)
table(HLSK$IncomeMT50, HLSK$Site)
table(HLSK$Identity, HLSK$Site)
table(HLSK$Married, HLSK$Site)
table(HLSK$Site, HLSK$Site)
table(HLSK$Educ, HLSK$Site)
table(HLSK$Language, HLSK$Site)



###############################################################
# ANALYSIS STEP 1:                                            # 
# Degree Distribution                                         #
###############################################################

round(prop.table(table(is.na(PATH$D_TOTAL_STD))),3)
round(prop.table(table(is.na(PATH$D_TOTAL_PRIME))),3)
round(prop.table(table(is.na(PATH$D_PAT))),3)
round(prop.table(table(is.na(PATH$D_INTERACT))),3)
round(prop.table(table(is.na(PATH$D_CLOSE))),3)


round(prop.table(table(is.na(HLSK$D_TOTAL))),3)
round(prop.table(table(is.na(HLSK$D_OH))),3)
round(prop.table(table(is.na(HLSK$D_INTERACT))),3)
round(prop.table(table(is.na(HLSK$D_CLOSE))),3)
round(prop.table(table(is.na(HLSK$D_FAM))),3)

summary(HLSK$D_TOTAL)
summary(HLSK$D_OH)
summary(HLSK$D_INTERACT)
summary(HLSK$D_CLOSE)
summary(HLSK$D_FAM)
  
summary(HLSK$D_TOTAL_WOOUT)
summary(HLSK$D_OH_WOOUT)
summary(HLSK$D_INTERACT_WOOUT)
summary(HLSK$D_CLOSE_WOOUT)
summary(HLSK$D_FAM_WOOUT)

###############################################################
# FIGURE 4A                                                   #
###############################################################

wilcox.test(PATH$D_TOTAL_STD,PATH$D_TOTAL_PRIME,paired=TRUE)
wilcox.test(PATH$D_TOTAL_STD,PATH$D_TOTAL_PAT,paired=TRUE)
wilcox.test(PATH$D_TOTAL_PRIME,PATH$D_TOTAL_PAT,paired=TRUE)

prop.table(table(PATH$D_TOTAL_STD>PATH$D_TOTAL_PRIME))
prop.table(table(PATH$D_TOTAL_STD<PATH$D_TOTAL_PAT))

D_TOTAL_STD_h<-ggplot(PATH, aes(x=D_TOTAL_STD)) + 
  geom_histogram(color="navy", fill="navy", binwidth=5)+ 
  ylim(0, 225)+
  geom_vline(aes(xintercept=mean(D_TOTAL_STD, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_TOTAL_STD, na.rm=T)+20, y =200,
                label = paste0("Mean: ", round(mean(D_TOTAL_STD, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_STD, na.rm=T)+20, y=180,
                label=paste0("SD: ", round(sd(D_TOTAL_STD, na.rm=T),2))),
            fontface ="italic", 
            size = 4, color="black", hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_STD, na.rm=T)+20, y=160,
                label=paste0("CV: ", round(sd(D_TOTAL_STD, na.rm=T)/
                                             mean(D_TOTAL_STD, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_STD, na.rm=T)+20, y=140,
                label=paste0("Median: ", round(median(D_TOTAL_STD, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",hjust = 0)+
  labs(x="Degree: Total\nStandard", y="Sample Size")
 

D_TOTAL_PRIME_h<-ggplot(PATH, aes(x=D_TOTAL_PRIME)) + 
  geom_histogram(color="navy", fill="navy", binwidth=)+ 
  ylim(0, 225)+
  geom_vline(aes(xintercept=mean(D_TOTAL_PRIME, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_TOTAL_PRIME, na.rm=T)+10, y =200,
                label = paste0("Mean: ", round(mean(D_TOTAL_PRIME, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_PRIME, na.rm=T)+10, y=180,
                label=paste0("SD: ", round(sd(D_TOTAL_PRIME, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_PRIME, na.rm=T)+10, y=160,
                label=paste0("CV: ", 
                             round(sd(D_TOTAL_PRIME, na.rm=T)/
                                     mean(D_TOTAL_PRIME, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_PRIME, na.rm=T)+10, y=140,
                label=paste0("Median: ", round(median(D_TOTAL_PRIME, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Total\nRecruitment Primed", y="Sample Size")
 

D_INTERACT_h<-ggplot(PATH, aes(x=D_INTERACT)) + 
  geom_histogram(color="navy", fill="navy", binwidth=5)+ 
  ylim(0, 225)+
  geom_vline(aes(xintercept=mean(D_INTERACT, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+10, y =200,
                label = paste0("Mean: ", round(mean(D_INTERACT, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+10, y=180,
                label=paste0("SD: ", round(sd(D_INTERACT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+10, y=160,
                label=paste0("CV: ", round(sd(D_INTERACT, na.rm=T)/
                                             mean(D_INTERACT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+10, y=140,
                label=paste0("Median: ", round(median(D_INTERACT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Interact\n>Once/Week", y="Sample Size")
 
D_CLOSE_h<-ggplot(PATH, aes(x=D_CLOSE)) + 
  geom_histogram(color="navy", fill="navy", binwidth=5)+ 
  ylim(0, 225)+
  geom_vline(aes(xintercept=mean(D_CLOSE, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y =200,
                label = paste0("Mean: ", round(mean(D_CLOSE, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y=180,
                label=paste0("SD: ", round(sd(D_CLOSE, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y=160,
                label=paste0("CV: ", round(sd(D_CLOSE, na.rm=T)/
                                             mean(D_CLOSE, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y=140,
                label=paste0("Median: ", round(median(D_CLOSE, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Close\nRelationship", y="Sample Size")
 
D_TOTAL_PAT_h<-ggplot(PATH, aes(x=D_TOTAL_PAT)) + 
  geom_histogram(color="navy", fill="navy", binwidth=100)+ 
  ylim(0, 225)+
  xlim(0, 2100)+
  geom_vline(aes(xintercept=mean(D_TOTAL_PAT, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_TOTAL_PAT, na.rm=T)+200, y =200,
                label = paste0("Mean: ", round(mean(D_TOTAL_PAT, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_PAT, na.rm=T)+200, y=180,
                label=paste0("SD: ", round(sd(D_TOTAL_PAT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_PAT, na.rm=T)+200, y=160,
                label=paste0("CV: ", round(sd(D_TOTAL_PAT, na.rm=T)/
                                             mean(D_TOTAL_PAT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_PAT, na.rm=T)+200, y=140,
                label=paste0("Median: ", round(median(D_TOTAL_PAT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Total Scale-up\nFirst Name Pat", y="Sample Size")
 


fig4a<-ggarrange(D_TOTAL_STD_h, D_TOTAL_PRIME_h, D_TOTAL_PAT_h,
                D_INTERACT_h, D_CLOSE_h, ncol=3, nrow=2,
                common.legend = TRUE)
fig4a
#Saved as a pdf for a size7.5 x 5

###############################################################
# FIGURE 4B                                                   #
###############################################################


wilcox.test(HLSK$D_TOTAL_WOOUT,HLSK$D_TOTAL_OH_WOOUT,paired=TRUE)
prop.table(table(HLSK$D_TOTAL_WOOUT<HLSK$D_TOTAL_OH_WOOUT))

D_TOTAL_WOOUT_h<-ggplot(HLSK, aes(x=D_TOTAL_WOOUT)) + 
  geom_histogram(color="navy", fill="navy", binwidth=10)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_TOTAL_WOOUT, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_TOTAL_WOOUT, na.rm=T)+120, y =90,
                label = paste0("Mean: ", round(mean(D_TOTAL_WOOUT, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_WOOUT, na.rm=T)+120, y=80,
                label=paste0("SD: ", round(sd(D_TOTAL_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_WOOUT, na.rm=T)+120, y=70,
                label=paste0("CV: ", round(sd(D_TOTAL_WOOUT, na.rm=T)/
                                             mean(D_TOTAL_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_WOOUT, na.rm=T)+120, y=60,
                label=paste0("Median: ", round(median(D_TOTAL_WOOUT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Total\n", y="Sample Size")
 
D_TOTAL_OH_WOOUT_h<-ggplot(HLSK, aes(x=D_TOTAL_OH_WOOUT)) + 
  geom_histogram(color="navy", fill="navy", binwidth=50)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_TOTAL_OH_WOOUT, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_TOTAL_OH_WOOUT, na.rm=T)+400, y =90,
                label = paste0("Mean: ", round(mean(D_TOTAL_OH_WOOUT, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_OH_WOOUT, na.rm=T)+400, y=80,
                label=paste0("SD: ", round(sd(D_TOTAL_OH_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_OH_WOOUT, na.rm=T)+400, y=70,
                label=paste0("CV: ", round(sd(D_TOTAL_OH_WOOUT, na.rm=T)/
                                             mean(D_TOTAL_OH_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_OH_WOOUT, na.rm=T)+400, y=60,
                label=paste0("Median: ", round(median(D_TOTAL_OH_WOOUT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Total Scale-Up\nLast Name Oh", y="Sample Size")

D_FAM_WOOUT_h<-ggplot(HLSK, aes(x=D_FAM_WOOUT)) + 
  geom_histogram(color="navy", fill="navy", binwidth=1)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_FAM_WOOUT, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_FAM_WOOUT, na.rm=T)+5, y =90,
                label = paste0("Mean: ", round(mean(D_FAM_WOOUT, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_FAM_WOOUT, na.rm=T)+5, y=80,
                label=paste0("SD: ", round(sd(D_FAM_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_FAM_WOOUT, na.rm=T)+5, y=70,
                label=paste0("CV: ", round(sd(D_FAM_WOOUT, na.rm=T)/
                                             mean(D_FAM_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_FAM_WOOUT, na.rm=T)+5, y=60,
                label=paste0("Median: ", round(median(D_FAM_WOOUT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Family\n", y="Sample Size")
 
D_INTERACT_WOOUT_h<-ggplot(HLSK, aes(x=D_INTERACT_WOOUT)) + 
  geom_histogram(color="navy", fill="navy", binwidth=1)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_INTERACT_WOOUT, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_INTERACT_WOOUT, na.rm=T)+8, y =90,
                label = paste0("Mean: ", round(mean(D_INTERACT_WOOUT, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT_WOOUT, na.rm=T)+8, y=80,
                label=paste0("SD: ", round(sd(D_INTERACT_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT_WOOUT, na.rm=T)+8, y=70,
                label=paste0("CV: ", round(sd(D_INTERACT_WOOUT, na.rm=T)/
                                             mean(D_INTERACT_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT_WOOUT, na.rm=T)+8, y=60,
                label=paste0("Median: ", round(median(D_INTERACT_WOOUT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Interact\n>Once/Week", y="Sample Size")
 
D_CLOSE_WOOUT_h<-ggplot(HLSK, aes(x=D_CLOSE_WOOUT)) + 
  geom_histogram(color="navy", fill="navy", binwidth=1)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_CLOSE_WOOUT, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_CLOSE_WOOUT, na.rm=T)+3, y =90,
                label = paste0("Mean: ", round(mean(D_CLOSE_WOOUT, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE_WOOUT, na.rm=T)+3, y=80,
                label=paste0("SD: ", round(sd(D_CLOSE_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE_WOOUT, na.rm=T)+3, y=70,
                label=paste0("CV: ", round(sd(D_CLOSE_WOOUT, na.rm=T)/
                                             mean(D_CLOSE_WOOUT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE_WOOUT, na.rm=T)+3, y=60,
                label=paste0("Median: ", round(median(D_CLOSE_WOOUT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Close\nRelationship", y="Sample Size")


blank <- grid.rect(gp=gpar(col="white"))

fig4b<-ggarrange(D_TOTAL_WOOUT_h, D_TOTAL_OH_WOOUT_h, blank,
                D_INTERACT_WOOUT_h, D_CLOSE_WOOUT_h,  
                D_FAM_WOOUT_h,
                ncol=3, nrow=2,
                common.legend = TRUE)
fig4b

#Saved as a pdf for a size7.5 x 5
 

###############################################################
# FIGURE 5A                                                   #
# HLSK degrees by question approach                           #
###############################################################
 

summary(glm(D_TOTAL_WOOUT_LOG~QuestionApproach,
            data=subset(HLSK, family="Gamma"(link = "log"))))
summary(glm(D_OH_WOOUT_LOG~QuestionApproach,
            data=subset(HLSK, family="Gamma"(link = "log"))))
summary(glm(D_INTERACT_WOOUT_LOG~QuestionApproach,
            data=subset(HLSK, family="Gamma"(link = "log"))))
summary(glm(D_CLOSE_WOOUT_LOG~QuestionApproach,
            data=subset(HLSK, family="Gamma"(link = "log"))))
summary(glm(D_FAM_WOOUT_LOG~QuestionApproach,
            data=subset(HLSK, family="Gamma"(link = "log"))))

aggregate(D_TOTAL_WOOUT~QuestionApproach, HLSK, 
          FUN=function(x){c(n=length(x), m=mean(x), sd=sd(x))})
aggregate(D_FAM_WOOUT~QuestionApproach, HLSK, 
          FUN=function(x){c(n=length(x), m=mean(x), sd=sd(x))})


Total<-HLSK %>%
  select(QuestionApproach, Degree=D_TOTAL_WOOUT) %>%
  mutate(DegreeType="1. Total (b,c)")

Oh<-HLSK %>%
  select(QuestionApproach, Degree=D_OH_WOOUT) %>%
  mutate(DegreeType="2. Last Name Oh (b,c)")

Interact<-HLSK %>%
  select(QuestionApproach, Degree=D_INTERACT_WOOUT) %>%
  mutate(DegreeType="3. Interact >1/Wk (b)")

Close<-HLSK %>%
  select(QuestionApproach, Degree=D_CLOSE_WOOUT) %>%
  mutate(DegreeType="4. Close Relationship (b)")

Fam<-HLSK %>%
  select(QuestionApproach, Degree=D_FAM_WOOUT) %>%
  mutate(DegreeType="5. Family (b,c)")
 
Degree<-rbind(Total, Oh, Interact, Close, Fam)    
 

fig5a<-Degree%>% 
  ggplot(aes(x=DegreeType, y=Degree, fill=QuestionApproach)) +
  geom_boxplot(outlier.size=1) +
  ylim(0,100)+
  theme(legend.position=c(0.8, 0.7))+
  scale_x_discrete(labels = c("Total \n(b,c)","Last Name Oh \n(b,c)",
                              "Interact>Once/Wk\n(b)",
                              "Close Relationship\n(b)",
                              "Family \n(b,c)"))+
  scale_fill_manual(values=c("white", "slateblue3", "orange")) +
  ylab("Degree\u2020")+xlab("Degree Type\u2021") 

fig5a
#Saved as a pdf for a size6.6 x 4.4

table(Total$Degree>100)


###############################################################
# FIGURE 5B:                                                  #
# HLSK degrees by language -- Bilingual respondents           #
###############################################################

summary(glm(D_TOTAL_WOOUT_LOG~InterviewLanguage,
            data=subset(HLSK, !is.na(InterviewLanguage), 
                        family="Gamma"(link = "log"))))
summary(glm(D_OH_WOOUT_LOG~INTVLANG_BILINGUAL,
            data=subset(HLSK, !is.na(InterviewLanguage), 
                        family="Gamma"(link = "log"))))
summary(glm(D_INTERACT_WOOUT_LOG~InterviewLanguage,
            data=subset(HLSK, !is.na(InterviewLanguage), 
                        family="Gamma"(link = "log"))))
summary(glm(D_CLOSE_WOOUT_LOG~InterviewLanguage,
            data=subset(HLSK, !is.na(InterviewLanguage), 
                        family="Gamma"(link = "log"))))
summary(glm(D_FAM_WOOUT_LOG~InterviewLanguage,
            data=subset(HLSK, !is.na(InterviewLanguage), 
                        family="Gamma"(link = "log"))))

aggregate(D_TOTAL_WOOUT~InterviewLanguage, 
          data=subset(HLSK, !is.na(InterviewLanguage)), 
          FUN=function(x){c(n=length(x), m=mean(x), sd=sd(x))})
aggregate(D_OH_WOOUT~InterviewLanguage, 
          data=subset(HLSK, !is.na(InterviewLanguage)), 
          FUN=function(x){c(n=length(x), m=mean(x), sd=sd(x))})
aggregate(D_INTERACT_WOOUT~InterviewLanguage, 
          data=subset(HLSK, !is.na(InterviewLanguage)), 
          FUN=function(x){c(n=length(x), m=mean(x), sd=sd(x))})
aggregate(D_CLOSE_WOOUT~InterviewLanguage, 
          data=subset(HLSK, !is.na(InterviewLanguage)), 
          FUN=function(x){c(n=length(x), m=mean(x), sd=sd(x))})


Total_b<-HLSK%>%filter(!is.na(InterviewLanguage))%>%
  select(InterviewLanguage, Degree=D_TOTAL_WOOUT)%>%
  mutate(DegreeType="1. Total")


Oh_b<-HLSK%>%
  filter(!is.na(HLSK$InterviewLanguage))%>%
  select(InterviewLanguage, Degree=D_OH_WOOUT)%>%
  mutate(DegreeType="2. Last Name Oh (b)")

Interact_b<-HLSK%>%
  filter(!is.na(HLSK$InterviewLanguage))%>%
  select(InterviewLanguage, Degree=D_INTERACT_WOOUT)%>%
  mutate(DegreeType="3. Interact>1/Wk")

Close_b<-HLSK%>%
  filter(!is.na(HLSK$InterviewLanguage))%>%
  select(InterviewLanguage, Degree=D_CLOSE_WOOUT)%>%
  mutate(DegreeType="4. Close Relationship (b)")

Fam_b<-HLSK%>%
  filter(!is.na(HLSK$InterviewLanguage))%>%
  select(InterviewLanguage, Degree=D_FAM_WOOUT)%>%
  mutate(DegreeType="5. Family")

Degree_b<-rbind(Total_b, Oh_b, Interact_b, Close_b, Fam_b)    

fig5b<-Degree_b%>% 
  #filter(!(DegreeType %in% c( "2. Total Oh_b Scale-Up"))) %>%
  ggplot(aes(x=DegreeType, y=Degree, fill=InterviewLanguage)) +
  geom_boxplot(outlier.size=1) +
  ylim(0,100)+
  theme(legend.position=c(0.8, 0.7))+
  scale_fill_manual(values=c("plum1","lightblue"))+
  ylab("Degree\u2020")+xlab("Degree Type\u2021")+
  scale_x_discrete(labels = c("Total","Last Name Oh \n(b)",
                              "Interact>Once/Wk\n(b)",
                              "Close Relationship\n(b)",
                              "Family"))


fig5b
#Saved as a pdf for a size6.6 x 4.4

table(Total_b$Degree>100)




###############################################################
# ANALYSIS STEP 2:                                            #
# Relationship across degrees                                 # 
###############################################################


###############################################################
# FIGURE 6A                                                   #
###############################################################

D_TOTAL_STD_PRIME_s<-ggscatter(PATH, x = "D_TOTAL_STD", y = "D_TOTAL_PRIME", 
                               color= "navy",fill="navy",size=1.25,
                               add = "reg.line",add.params = list(color = "red"),
                               ggtheme=theme_gray()) + ylim(0, 100)+
  stat_cor(label.x = 90, label.y = 75, size=4, method="spearman",
           cor.coef.name="rho",p.accuracy = 0.001)+
  geom_jitter(width = 10, height = 5,size=1, color="navy") +
  labs(x="Degree: Total Standard\n", y="\nDegree: Total Recruitment Primed")

D_TOTAL_STD_PAT_s<-ggscatter(PATH, x = "D_TOTAL_STD", y = "D_TOTAL_PAT", 
                             color= "navy",fill="navy",size=1.25,
                             add = "reg.line",add.params = list(color = "red"),
                             ggtheme=theme_gray()) + ylim(0, 2000)+
  stat_cor(label.x = 90, label.y = 1200, size=4, method="spearman",
           cor.coef.name="rho",p.accuracy = 0.001)+
  geom_jitter(width = 12, height = 50, size=1, color="navy") +
  labs(x="Degree: Total Standard\n", y="Degree: Total Scale-up\nFirst Name Pat")

D_TOTAL_STD_INTERACT_s<-ggscatter(PATH, x = "D_TOTAL_STD", y = "D_INTERACT", 
                                  color= "navy",fill="navy",size=1.25,
                                  add = "reg.line",add.params = list(color = "red"),
                                  ggtheme=theme_gray()) + ylim(0, 100)+
  stat_cor(label.x = 90, label.y = 75, size=4, method="spearman",
           cor.coef.name="rho",p.accuracy = 0.001)+
  geom_jitter(width = 12, height = 50, size=1, color="navy") +
  labs(x="Degree: Total Standard\n", y="\nDegree: Interact >Once/Week")

D_TOTAL_STD_CLOSE_s<-ggscatter(PATH, x = "D_TOTAL_STD", y = "D_CLOSE", 
                               color= "navy",fill="navy",size=1.25,
                               add = "reg.line",add.params = list(color = "red"),
                               ggtheme=theme_gray()) + ylim(0, 100)+
  stat_cor(label.x = 90, label.y = 70, size=4, method="spearman",
           cor.coef.name="rho",p.accuracy = 0.001)+
  geom_jitter(width = 12, height = 50,size=1, color="navy") +
  labs(x="Degree: Total Standard\n", y="\nDegree: Close Relationship")
 

fig6a<-ggarrange(D_TOTAL_STD_PRIME_s,
                D_TOTAL_STD_PAT_s,
                D_TOTAL_STD_INTERACT_s,
                D_TOTAL_STD_CLOSE_s,
                nrow=2,ncol=2)

fig6a
#Saved as a pdf for a size6.3x6.3

###############################################################
# FIGURE 6B                                                   #
###############################################################

D_TOTAL_OH_s<-ggscatter(HLSK, x = "D_TOTAL_WOOUT", y = "D_TOTAL_OH_WOOUT",  
                        color= "navy", fill="navy", size=1.25,
                        add = "reg.line", add.params = list(color = "red"),
                        ggtheme=theme_gray()) + ylim(0,800)+ 
  stat_cor(label.x = 220, label.y = 600, size=4, 
           method="spearman",cor.coef.name="rho",p.accuracy = 0.001)+
  geom_jitter(width = 12, height = 50, size=1,color= "navy") +
  labs(x="Degree: Total\n", y="Degree: Total Scale-up\nLast Name Oh")

D_TOTAL_FAM_s<-ggscatter(HLSK, x = "D_TOTAL_WOOUT", y = "D_FAM_WOOUT",  
                         color= "navy", fill="navy", size=1.25,
                         add = "reg.line", add.params = list(color = "red"),
                         ggtheme=theme_gray()) + ylim(0,20)+ 
  stat_cor(label.x = 200, label.y = 10, size=4, 
           method="spearman",cor.coef.name="rho",p.accuracy = 0.001)+
  geom_jitter(width = 12, height = 4, size=1,color= "navy") +
  labs(x="Degree: Total\n", y="Degree: Family")

D_TOTAL_INTERACT_s<-ggscatter(HLSK, x = "D_TOTAL_WOOUT", y = "D_INTERACT_WOOUT",  
                              color= "navy", fill="navy", size=1.25,
                              add = "reg.line", add.params = list(color = "red"),
                              ggtheme=theme_gray()) + ylim(0,50)+ 
  stat_cor(label.x = 220, label.y = 600, size=4, 
           method="spearman",cor.coef.name="rho",p.accuracy = 0.001)+
  geom_jitter(width = 12, height = 5, size=1,color= "navy") +
  labs(x="Degree: Total\n", y="\nDegree: Interact >Once/Week")

D_TOTAL_CLOSE_s<-ggscatter(HLSK, x = "D_TOTAL_WOOUT", y = "D_CLOSE_WOOUT",  
                           color= "navy", fill="navy", size=1.25,
                           add = "reg.line", add.params = list(color = "red"),
                           ggtheme=theme_gray()) + ylim(0,20)+ 
  stat_cor(label.x = 220, label.y = 11, size=4, 
           method="spearman",cor.coef.name="rho",p.accuracy = 0.001)+
  geom_jitter(width = 12, height = 4, size=1,color= "navy") +
  labs(x="Degree: Total\n", y="\nDegree: Close Relationship")


fig6b<-ggarrange(D_TOTAL_OH_s,
                D_TOTAL_INTERACT_s,
                D_TOTAL_CLOSE_s,
                D_TOTAL_FAM_s,
                nrow=2,ncol=2)

fig6b

#Saved as a pdf for a size6.3x6.3


###############################################################
# ANALYSIS STEP 3:                                            #
# Relationship between degrees and recruitment success        #
###############################################################

summary(PATH$RECRUIT_NUM/PATH$CPN_NUM) 

PATH %>%
  tabyl(CPN_NUM, RECRUIT_NUM)%>%
  adorn_totals(c("row","col"))

summary(HLSK$RECRUIT_NUM/HLSK$CPN_NUM) 

HLSK %>%
  tabyl(CPN_NUM, RECRUIT_NUM)%>%
  adorn_totals(c("row","col"))


###############################################################
# TABLE 2A                                                    # 
###############################################################

zero<-PATH[which(PATH$RECRUIT_NUM==0),]
one<-PATH[which(PATH$RECRUIT_NUM==1),]
two<-PATH[which(PATH$RECRUIT_NUM==2),]
three<-PATH[which(PATH$RECRUIT_NUM==3),]

table_2a<-cbind(
  rbind(mean(zero$D_TOTAL_STD, na.rm=T),
        mean(zero$D_TOTAL_PRIME, na.rm=T),
        mean(zero$D_TOTAL_PAT, na.rm=T),
        mean(zero$D_INTERACT, na.rm=T),
        mean(zero$D_CLOSE, na.rm=T)),
  rbind(se(zero$D_TOTAL_STD),
        se(zero$D_TOTAL_PRIME),
        se(zero$D_TOTAL_PAT),
        se(zero$D_INTERACT),
        se(zero$D_CLOSE)),
  rbind(mean(one$D_TOTAL_STD, na.rm=T),
        mean(one$D_TOTAL_PRIME, na.rm=T),
        mean(one$D_TOTAL_PAT, na.rm=T),
        mean(one$D_INTERACT, na.rm=T),
        mean(one$D_CLOSE, na.rm=T)),
  rbind(se(one$D_TOTAL_STD),
        se(one$D_TOTAL_PRIME),
        se(one$D_TOTAL_PAT),
        se(one$D_INTERACT),
        se(one$D_CLOSE)),
  rbind(mean(two$D_TOTAL_STD, na.rm=T),
        mean(two$D_TOTAL_PRIME, na.rm=T),
        mean(two$D_TOTAL_PAT, na.rm=T),
        mean(two$D_INTERACT, na.rm=T),
        mean(two$D_CLOSE, na.rm=T)),
  rbind(se(two$D_TOTAL_STD),
        se(two$D_TOTAL_PRIME),
        se(two$D_TOTAL_PAT),
        se(two$D_INTERACT),
        se(two$D_CLOSE)),
  rbind(mean(three$D_TOTAL_STD, na.rm=T),
        mean(three$D_TOTAL_PRIME, na.rm=T),
        mean(three$D_TOTAL_PAT, na.rm=T),
        mean(three$D_INTERACT, na.rm=T),
        mean(three$D_CLOSE, na.rm=T)),
  rbind(se(three$D_TOTAL_STD),
        se(three$D_TOTAL_PRIME),
        se(three$D_TOTAL_PAT),
        se(three$D_INTERACT),
        se(three$D_CLOSE))) 

round(table_2a,1)


###############################################################
# TABLE 2B                                                    #
###############################################################

zero<-HLSK[which(HLSK$RECRUIT_NUM==0),]
one<-HLSK[which(HLSK$RECRUIT_NUM==1),]
two<-HLSK[which(HLSK$RECRUIT_NUM==2),]


table_2b<-cbind(
  
  #Mean of recruit counts
  rbind(mean(zero$D_TOTAL_WOOUT, na.rm=T),
    mean(zero$D_TOTAL_OH_WOOUT, na.rm=T),
    mean(zero$D_INTERACT_WOOUT, na.rm=T),
    mean(zero$D_CLOSE_WOOUT, na.rm=T),
    mean(zero$D_FAM_WOOUT, na.rm=T)),
  rbind(se(zero$D_TOTAL_WOOUT),
        se(zero$D_TOTAL_OH_WOOUT),
        se(zero$D_INTERACT_WOOUT),
        se(zero$D_CLOSE_WOOUT),
        se(zero$D_FAM_WOOUT)),
  rbind(mean(one$D_TOTAL_WOOUT, na.rm=T),
    mean(one$D_TOTAL_OH_WOOUT, na.rm=T),
    mean(one$D_INTERACT_WOOUT, na.rm=T),
    mean(one$D_CLOSE_WOOUT, na.rm=T),
    mean(one$D_FAM_WOOUT, na.rm=T)),
  rbind(se(one$D_TOTAL_WOOUT),
        se(one$D_TOTAL_OH_WOOUT),
        se(one$D_INTERACT_WOOUT),
        se(one$D_CLOSE_WOOUT),
        se(one$D_FAM_WOOUT)),
  rbind(mean(two$D_TOTAL_WOOUT, na.rm=T),
    mean(two$D_TOTAL_OH_WOOUT, na.rm=T),
    mean(two$D_INTERACT_WOOUT, na.rm=T),
    mean(two$D_CLOSE_WOOUT, na.rm=T),
    mean(two$D_FAM_WOOUT, na.rm=T)), 
  rbind(se(two$D_TOTAL_WOOUT),
    se(two$D_TOTAL_OH_WOOUT),
    se(two$D_INTERACT_WOOUT),
    se(two$D_CLOSE_WOOUT),
    se(two$D_FAM_WOOUT))) 

round(table_2b,1)
 

###############################################################
# TABLE 3A                                                    #
###############################################################

PATH_m<-na.omit(
  PATH[,c("SUCC","FAIL","CPN_NUM",
          "Agecat3","Male","NHWhite", 
          "Educ","Employed","IncomeLE20",
          "LivingAlone","Location2",
          "LOG_D_TOTAL_STD","LOG_D_TOTAL_PRIME","LOG_D_TOTAL_PAT",
          "LOG_D_INTERACT","LOG_D_CLOSE")])


m_null <- glm(cbind(SUCC,FAIL) ~ factor(CPN_NUM)+factor(Agecat3)+Male+NHWhite+
                LivingAlone+factor(Educ)+Employed+factor(IncomeLE20)+
                factor(Location2), family="quasibinomial", data=PATH_m)

 
m_D_TOTAL_STD <- glm(cbind(SUCC,FAIL) ~ factor(CPN_NUM)+factor(Agecat3)+Male+NHWhite+
                           LivingAlone+factor(Educ)+Employed+factor(IncomeLE20)+
                           factor(Location2)+
                           scale(LOG_D_TOTAL_STD), family="quasibinomial", data=PATH_m)

m_D_TOTAL_PRIME <- glm(cbind(SUCC,FAIL) ~ factor(CPN_NUM)+factor(Agecat3)+Male+NHWhite+
                             LivingAlone+factor(Educ)+Employed+factor(IncomeLE20)+
                             factor(Location2)+
                             scale(LOG_D_TOTAL_PRIME), family="quasibinomial", data=PATH_m)

m_D_TOTAL_PAT <- glm(cbind(SUCC,FAIL) ~ factor(CPN_NUM)+factor(Agecat3)+Male+NHWhite+
                           LivingAlone+factor(Educ)+Employed+factor(IncomeLE20)+
                           factor(Location2)+
                           scale(LOG_D_TOTAL_PAT), family="quasibinomial", data=PATH_m)

m_D_INTERACT <- glm(cbind(SUCC,FAIL) ~ factor(CPN_NUM)+factor(Agecat3)+Male+NHWhite+
                          LivingAlone+factor(Educ)+Employed+factor(IncomeLE20)+
                          factor(Location2)+
                          scale(LOG_D_INTERACT), family="quasibinomial", data=PATH_m)

m_D_CLOSE <- glm(cbind(SUCC,FAIL) ~ factor(CPN_NUM)+factor(Agecat3)+Male+NHWhite+
                       LivingAlone+factor(Educ)+Employed+factor(IncomeLE20)+
                       factor(Location2)+
                       scale(LOG_D_CLOSE), family="quasibinomial", data=PATH_m)
summary(m_null)

table_3a<-cbind(
  rbind(m_null$deviance,
        m_D_TOTAL_STD$deviance-m_null$deviance,
        m_D_TOTAL_PRIME$deviance-m_null$deviance,
        m_D_TOTAL_PAT$deviance-m_null$deviance,
        m_D_INTERACT$deviance-m_null$deviance,
        m_D_CLOSE$deviance-m_null$deviance), 
  
  rbind(NA,
        round(coef(summary(m_D_TOTAL_STD))[14,1],3),
        round(coef(summary(m_D_TOTAL_PRIME))[14,1],3),
        round(coef(summary(m_D_TOTAL_PAT))[14,1],3),
        round(coef(summary(m_D_INTERACT))[14,1],3),
        round(coef(summary(m_D_CLOSE))[14,1],3)),
  
  rbind(NA,
        round(coef(summary(m_D_TOTAL_STD))[14,2],3),
        round(coef(summary(m_D_TOTAL_PRIME))[14,2],3),
        round(coef(summary(m_D_TOTAL_PAT))[14,2],3),
        round(coef(summary(m_D_INTERACT))[14,2],3),
        round(coef(summary(m_D_CLOSE))[14,2],3)),
  
  rbind(NA,
        round(coef(summary(m_D_TOTAL_STD))[14,4],3),
        round(coef(summary(m_D_TOTAL_PRIME))[14,4],3),
        round(coef(summary(m_D_TOTAL_PAT))[14,4],3),
        round(coef(summary(m_D_INTERACT))[14,4],3),
        round(coef(summary(m_D_CLOSE))[14,4],3)))

round(table_3a,3)



###############################################################
# TABLE 3B                                                    #
###############################################################

HLSK_m<-na.omit(
  HLSK[,c("SUCC","FAIL","Agecat","Male","Married",
          "Educ","Employed","IncomeMT50","Language","Site","Identity",
          "D_TOTAL_LOG","D_TOTAL_OH_LOG","D_FAM_LOG",
          "D_INTERACT_LOG","D_CLOSE_LOG")])

m_null<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
               factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Language)+factor(Site)+factor(Identity), 
             HLSK_m, family=quasibinomial)

m_D_TOTAL_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                      factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Language)+factor(Site)+factor(Identity)+
                      scale(D_TOTAL_LOG), 
                    HLSK_m, family=quasibinomial)

m_D_TOTAL_OH_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                   factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Language)+factor(Site)+factor(Identity)+
                   scale(D_TOTAL_OH_LOG), 
                 HLSK_m, family=quasibinomial)

m_D_INTERACT_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                         factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Language)+factor(Site)+factor(Identity)+
                         scale(D_INTERACT_LOG), 
                       HLSK_m, family=quasibinomial)

m_D_CLOSE_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                      factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Language)+factor(Site)+factor(Identity)+
                      scale(D_CLOSE_LOG), 
                    HLSK_m, family=quasibinomial)


m_D_FAM_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                    factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Language)+factor(Site)+factor(Identity)+
                    scale(D_FAM_LOG), 
                  HLSK_m, family=quasibinomial)
 
summary(m_null)

table_3b<-cbind(
  rbind(m_null$deviance,
        m_D_TOTAL_LOG$deviance-m_null$deviance,
        m_D_TOTAL_OH_LOG$deviance-m_null$deviance,
        m_D_INTERACT_LOG$deviance-m_null$deviance,
        m_D_CLOSE_LOG$deviance-m_null$deviance,
        m_D_FAM_LOG$deviance-m_null$deviance), 

  rbind(NA,
        round(coef(summary(m_D_TOTAL_LOG))[16,1],3),
        round(coef(summary(m_D_TOTAL_OH_LOG))[16,1],3),
        round(coef(summary(m_D_INTERACT_LOG))[16,1],3),
        round(coef(summary(m_D_CLOSE_LOG))[16,1],3),
        round(coef(summary(m_D_FAM_LOG))[16,1],3)),

  rbind(NA,
        round(coef(summary(m_D_TOTAL_LOG))[16,2],3),
        round(coef(summary(m_D_TOTAL_OH_LOG))[16,2],3),
        round(coef(summary(m_D_INTERACT_LOG))[16,2],3),
        round(coef(summary(m_D_CLOSE_LOG))[16,2],3),
        round(coef(summary(m_D_FAM_LOG))[16,2],3)),
  
  rbind(NA,
        round(coef(summary(m_D_TOTAL_LOG))[16,4],3),
        round(coef(summary(m_D_TOTAL_OH_LOG))[16,4],3),
        round(coef(summary(m_D_INTERACT_LOG))[16,4],3),
        round(coef(summary(m_D_CLOSE_LOG))[16,4],3),
        round(coef(summary(m_D_FAM_LOG))[16,4],3)))

round(table_3a,3)

###############################################################
# TABLE 3C                                                    #
###############################################################

HLSK_bi<-na.omit(
  HLSK[,c("SUCC","FAIL","Agecat","Male","Married",
          "Educ","Employed","IncomeMT50","Language","Site","Identity",
          "D_TOTAL_LOG","D_TOTAL_OH_LOG","D_FAM_LOG",
          "D_INTERACT_LOG","D_CLOSE_LOG",
          "InterviewLanguage")])


# ENGLISH INTERVIEWS

e_null<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
               factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity), 
             subset(HLSK_bi, Language==1), family=quasibinomial)

e_D_TOTAL_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                      factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                      scale(D_TOTAL_LOG), 
                    subset(HLSK_bi, Language==1), family=quasibinomial)

e_D_FAM_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                    factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                    scale(D_FAM_LOG), 
                  subset(HLSK_bi, Language==1), family=quasibinomial)

e_D_TOTAL_OH_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                   factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                   scale(D_TOTAL_OH_LOG), 
                 subset(HLSK_bi, Language==1), family=quasibinomial)

e_D_INTERACT_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                         factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                         scale(D_INTERACT_LOG), 
                       subset(HLSK_bi, Language==1), family=quasibinomial)

e_D_CLOSE_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                      factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                      scale(D_CLOSE_LOG), 
                    subset(HLSK_bi, Language==1), family=quasibinomial)

summary(e_null)

table_3c_EN<-cbind(
  rbind(e_null$deviance,
        e_D_TOTAL_LOG$deviance-e_null$deviance,
        e_D_TOTAL_OH_LOG$deviance-e_null$deviance,
        e_D_INTERACT_LOG$deviance-e_null$deviance,
        e_D_CLOSE_LOG$deviance-e_null$deviance,
        e_D_FAM_LOG$deviance-e_null$deviance), 
  
  rbind(NA,
        round(coef(summary(e_D_TOTAL_LOG))[15,1],3),
        round(coef(summary(e_D_TOTAL_OH_LOG))[15,1],3),
        round(coef(summary(e_D_INTERACT_LOG))[15,1],3),
        round(coef(summary(e_D_CLOSE_LOG))[15,1],3),
        round(coef(summary(e_D_FAM_LOG))[15,1],3)),
  
  rbind(NA,
        round(coef(summary(e_D_TOTAL_LOG))[15,2],3),
        round(coef(summary(e_D_TOTAL_OH_LOG))[15,2],3),
        round(coef(summary(e_D_INTERACT_LOG))[15,2],3),
        round(coef(summary(e_D_CLOSE_LOG))[15,2],3),
        round(coef(summary(e_D_FAM_LOG))[15,2],3)),
  
  rbind(NA,
        round(coef(summary(e_D_TOTAL_LOG))[15,4],3),
        round(coef(summary(e_D_TOTAL_OH_LOG))[15,4],3),
        round(coef(summary(e_D_INTERACT_LOG))[15,4],3),
        round(coef(summary(e_D_CLOSE_LOG))[15,4],3),
        round(coef(summary(e_D_FAM_LOG))[15,4],3)))

round(table_3c_EN,3)


# KOREAN INTERVIEWS

k_null<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
               factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity), 
             subset(HLSK_bi, Language==0), family=quasibinomial)

k_D_TOTAL_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                      factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                      scale(D_TOTAL_LOG), 
                    subset(HLSK_bi, Language==0), family=quasibinomial)

k_D_FAM_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                    factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                    scale(D_FAM_LOG), 
                  subset(HLSK_bi, Language==0), family=quasibinomial)

k_D_TOTAL_OH_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                         factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                         scale(D_TOTAL_OH_LOG), 
                       subset(HLSK_bi, Language==0), family=quasibinomial)

k_D_INTERACT_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                         factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                         scale(D_INTERACT_LOG), 
                       subset(HLSK_bi, Language==0), family=quasibinomial)

k_D_CLOSE_LOG<-glm (cbind(SUCC,FAIL)~factor(Agecat)+factor(Male)+factor(Married)+
                      factor(Educ)+factor(Employed)+factor(IncomeMT50)+factor(Site)+factor(Identity)+
                      scale(D_CLOSE_LOG), 
                    subset(HLSK_bi, Language==0), family=quasibinomial)

summary(k_null)

table_3c_KR<-cbind(
  rbind(k_null$deviance,
        k_D_TOTAL_LOG$deviance-k_null$deviance,
        k_D_TOTAL_OH_LOG$deviance-k_null$deviance,
        k_D_INTERACT_LOG$deviance-k_null$deviance,
        k_D_CLOSE_LOG$deviance-k_null$deviance,
        k_D_FAM_LOG$deviance-k_null$deviance), 
  
  rbind(NA,
        round(coef(summary(k_D_TOTAL_LOG))[15,1],3),
        round(coef(summary(k_D_TOTAL_OH_LOG))[15,1],3),
        round(coef(summary(k_D_INTERACT_LOG))[15,1],3),
        round(coef(summary(k_D_CLOSE_LOG))[15,1],3),
        round(coef(summary(k_D_FAM_LOG))[15,1],3)),
  
  rbind(NA,
        round(coef(summary(k_D_TOTAL_LOG))[15,2],3),
        round(coef(summary(k_D_TOTAL_OH_LOG))[15,2],3),
        round(coef(summary(k_D_INTERACT_LOG))[15,2],3),
        round(coef(summary(k_D_CLOSE_LOG))[15,2],3),
        round(coef(summary(k_D_FAM_LOG))[15,2],3)),
  
  rbind(NA,
        round(coef(summary(k_D_TOTAL_LOG))[15,4],3),
        round(coef(summary(k_D_TOTAL_OH_LOG))[15,4],3),
        round(coef(summary(k_D_INTERACT_LOG))[15,4],3),
        round(coef(summary(k_D_CLOSE_LOG))[15,4],3),
        round(coef(summary(k_D_FAM_LOG))[15,4],3)))

round(table_3c_KR,3)
 
  

###############################################################
# ANALYSIS STEP 4:                                            #
# Population Inference                                        #
###############################################################

###############################################################
# TABLE 4 
###############################################################


# ACS 

dim(ACS)

dACS= svrepdesign(data=ACS, 
                  weights=ACS$PWGTP, 
                  repweights=ACS[,31:110], 
                  type="BRR", combined.weights=TRUE) 

table_4_acs<-round(svymean(~Age18_29+Male+Married+Educ_COLL+IncomeMT50+Employed,
                    dACS,na.rm=T),3)
table_4_acs

# HLSK

HLSK$RECRUITER_ID <- as.character(Find_Rec_ID(HLSK)) 

# Recode SEEDs RECRUITER number to 1
HLSK$RECRUITER_ID[HLSK$SEED==1]<--1 
HLSK$RECRUITER_ID[is.na(HLSK$RECRUITER_ID)==T]<--1


HLSK_R_TOTAL <- as.rds.data.frame(HLSK, 
                                  id = "UNIQUE_NUM", 
                                  recruiter.id = "RECRUITER_ID",
                                  network.size = "D_TOTAL_WOOUT", 
                                  population.size = 1000000, 
                                  max.coupons = 2)

HLSK_R_TOTAL_OH <- as.rds.data.frame(HLSK, 
                                     id = "UNIQUE_NUM", 
                                     recruiter.id = "RECRUITER_ID",
                                     network.size = "D_TOTAL_OH_WOOUT", 
                                     population.size = 1000000, 
                                     max.coupons = 2)

HLSK_R_INTERACT <- as.rds.data.frame(HLSK, 
                                     id = "UNIQUE_NUM", 
                                     recruiter.id = "RECRUITER_ID",
                                     network.size = "D_INTERACT_WOOUT", 
                                     population.size = 1000000, 
                                     max.coupons = 2)

HLSK_R_CLOSE <- as.rds.data.frame(HLSK, 
                                  id = "UNIQUE_NUM", 
                                  recruiter.id = "RECRUITER_ID",
                                  network.size = "D_CLOSE_WOOUT", 
                                  population.size = 1000000, 
                                  max.coupons = 2)

HLSK_R_FAM <- as.rds.data.frame(HLSK, 
                                id = "UNIQUE_NUM", 
                                recruiter.id = "RECRUITER_ID",
                                network.size = "D_FAM_WOOUT", 
                                population.size = 1000000, 
                                max.coupons = 2)


varlist<-c("Age18_29","Male","Married","Edu_COLL","IncomeMT50","Employed")
 
#Note to users: "RDS.bootstrap.intervals" takes a bit of time 

set.seed(1234)

TOTAL_rds2_bs_mean = c() 
TOTAL_rds2_bs_se = c() 

for(v in 1:length(varlist)){
  temp<-RDS.bootstrap.intervals(HLSK_R_TOTAL,outcome.variable=varlist[v],
                                weight.type="RDS-II",
                                number.of.bootstrap.samples=500,
                                N=158242)
  TOTAL_rds2_bs_mean[v] = ifelse(is.null(temp),0,temp[[2]][1])
  TOTAL_rds2_bs_se[v] = ifelse(is.null(temp),0,temp[[2]][5])
}

TOTAL_OH_rds2_bs_mean = c() 
TOTAL_OH_rds2_bs_se = c() 

for(v in 1:length(varlist)){
  temp<-RDS.bootstrap.intervals(HLSK_R_TOTAL_OH,outcome.variable=varlist[v],
                                weight.type="RDS-II",
                                number.of.bootstrap.samples=500,
                                N=158242)
  TOTAL_OH_rds2_bs_mean[v] = ifelse(is.null(temp),0,temp[[2]][1])
  TOTAL_OH_rds2_bs_se[v] = ifelse(is.null(temp),0,temp[[2]][5])
}


INTERACT_rds2_bs_mean = c() 
INTERACT_rds2_bs_se = c() 

for(v in 1:length(varlist)){
  temp<-RDS.bootstrap.intervals(HLSK_R_INTERACT,outcome.variable=varlist[v],
                                weight.type="RDS-II",
                                number.of.bootstrap.samples=500,
                                N=158242)
  INTERACT_rds2_bs_mean[v] = ifelse(is.null(temp),0,temp[[2]][1])
  INTERACT_rds2_bs_se[v] = ifelse(is.null(temp),0,temp[[2]][5])
}


CLOSE_rds2_bs_mean = c() 
CLOSE_rds2_bs_se = c() 

for(v in 1:length(varlist)){
  temp<-RDS.bootstrap.intervals(HLSK_R_CLOSE,outcome.variable=varlist[v],
                                weight.type="RDS-II",
                                number.of.bootstrap.samples=500,
                                N=158242)
  CLOSE_rds2_bs_mean[v] = ifelse(is.null(temp),0,temp[[2]][1])
  CLOSE_rds2_bs_se[v] = ifelse(is.null(temp),0,temp[[2]][5])
}



FAM_rds2_bs_mean = c() 
FAM_rds2_bs_se = c() 

for(v in 1:length(varlist)){
  temp<-RDS.bootstrap.intervals(HLSK_R_FAM,outcome.variable=varlist[v],
                                weight.type="RDS-II",
                                number.of.bootstrap.samples=500,
                                N=158242)
  FAM_rds2_bs_mean[v] = ifelse(is.null(temp),0,temp[[2]][1])
  FAM_rds2_bs_se[v] = ifelse(is.null(temp),0,temp[[2]][5])
}


Uwt_mean<-HLSK%>%
  select(varlist)%>%
  summarise(across(where(is.numeric), ~ mn(.x)))

Uwt_se<-HLSK%>%
  select(varlist)%>%
  summarise(across(where(is.numeric), ~ se(.x)))

table_4_hlsk<-round(cbind(rbind(Uwt_mean,
                TOTAL_rds2_bs_mean,
                TOTAL_OH_rds2_bs_mean,
                INTERACT_rds2_bs_mean,
                CLOSE_rds2_bs_mean,
                FAM_rds2_bs_mean),
                rbind(Uwt_se,
                TOTAL_rds2_bs_se,
                TOTAL_OH_rds2_bs_se,
                INTERACT_rds2_bs_se,
                CLOSE_rds2_bs_se,
                FAM_rds2_bs_se)),3)

table_4_hlsk



###############################################################
# APPENDIX 1                                                  #
###############################################################

options(scipen = 999)
 
D_TOTAL_h<-ggplot(HLSK, aes(x=D_TOTAL)) + 
  geom_histogram(color="navy", fill="navy", binwidth=1000)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_TOTAL, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_TOTAL, na.rm=T)+40000, y =90,
                label = paste0("Mean: ", round(mean(D_TOTAL, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL, na.rm=T)+40000, y=80,
                label=paste0("SD: ", round(sd(D_TOTAL, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL, na.rm=T)+40000, y=70,
                label=paste0("CV: ", round(sd(D_TOTAL, na.rm=T)/
                                             mean(D_TOTAL, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL, na.rm=T)+40000, y=60,
                label=paste0("Median: ", round(median(D_TOTAL, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL, na.rm=T)+40000, y=50,
                label=paste0("Maximum: ", round(max(D_TOTAL, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL, na.rm=T)+40000, y=40,
                label="98th Percentile: 500",
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Total\nwith Outliers\n", y="Sample Size")+
  scale_x_continuous(labels = scales::scientific)

D_TOTAL_OH_h<-ggplot(HLSK, aes(x=D_TOTAL_OH)) + 
  geom_histogram(color="navy", fill="navy", binwidth=3000)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_TOTAL_OH, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_TOTAL_OH, na.rm=T)+400000, y =90,
                label = paste0("Mean: ", round(mean(D_TOTAL_OH, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_OH, na.rm=T)+400000, y=80,
                label=paste0("SD: ", round(sd(D_TOTAL_OH, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_OH, na.rm=T)+400000, y=70,
                label=paste0("CV: ", round(sd(D_TOTAL_OH, na.rm=T)/
                                             mean(D_TOTAL_OH, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_OH, na.rm=T)+400000, y=60,
                label=paste0("Median: ", round(median(D_TOTAL_OH, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_OH, na.rm=T)+400000, y=50,
                label=paste0("Maximum: ", round(max(D_TOTAL_OH, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_TOTAL_OH, na.rm=T)+400000, y=40,
                label="98th Percentile: 31",
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Total Scale-Up\nLast Name Oh\nwith Outliers", y="Sample Size")+
  scale_x_continuous(labels = scales::scientific)

D_INTERACT_h<-ggplot(HLSK, aes(x=D_INTERACT)) + 
  geom_histogram(color="navy", fill="navy", binwidth=1000)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_INTERACT, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+25000, y =90,
                label = paste0("Mean: ", round(mean(D_INTERACT, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+25000, y=80,
                label=paste0("SD: ", round(sd(D_INTERACT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+25000, y=70,
                label=paste0("CV: ", round(sd(D_INTERACT, na.rm=T)/
                                             mean(D_INTERACT, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+25000, y=60,
                label=paste0("Median: ", round(median(D_INTERACT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+25000, y=50,
                label=paste0("Maximum: ", round(max(D_INTERACT, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_INTERACT, na.rm=T)+25000, y=40,
                label="98th Percentile: 50",
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Interact\n>Once/Week\nwith Outliers", y="Sample Size")+
  scale_x_continuous(labels = scales::scientific,
                     breaks=seq(0,250000,100000))

D_CLOSE_h<-ggplot(HLSK, aes(x=D_CLOSE)) + 
  geom_histogram(color="navy", fill="navy", binwidth=1)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_CLOSE, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y =90,
                label = paste0("Mean: ", round(mean(D_CLOSE, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y=80,
                label=paste0("SD: ", round(sd(D_CLOSE, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y=70,
                label=paste0("CV: ", round(sd(D_CLOSE, na.rm=T)/
                                             mean(D_CLOSE, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y=60,
                label=paste0("Median: ", round(median(D_CLOSE, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y=50,
                label=paste0("Maximum: ", round(max(D_CLOSE, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_CLOSE, na.rm=T)+10, y=40,
                label="98th Percentile: 20",
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Close\nRelationship\nwith Outliers", y="Sample Size")+
  scale_x_continuous()


D_FAM_h<-ggplot(HLSK, aes(x=D_FAM)) + 
  geom_histogram(color="navy", fill="navy", binwidth=1000)+ 
  ylim(0, 100)+
  geom_vline(aes(xintercept=mean(D_FAM, na.rm=T)),
             color="red", linetype="dashed", size=1)+
  geom_text(aes(x = mean(D_FAM, na.rm=T)+20000, y =90,
                label = paste0("Mean: ", round(mean(D_FAM, na.rm=T), 2))),
            size = 4, color="black", fontface="italic",
            hjust = 0)+
  geom_text(aes(x = mean(D_FAM, na.rm=T)+20000, y=80,
                label=paste0("SD: ", round(sd(D_FAM, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_FAM, na.rm=T)+20000, y=70,
                label=paste0("CV: ", round(sd(D_FAM, na.rm=T)/
                                             mean(D_FAM, na.rm=T),2)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_FAM, na.rm=T)+20000, y=60,
                label=paste0("Median: ", round(median(D_FAM, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_FAM, na.rm=T)+20000, y=50,
                label=paste0("Maximum: ", round(max(D_FAM, na.rm=T),1)),
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  geom_text(aes(x = mean(D_FAM, na.rm=T)+20000, y=40,
                label="98th Percentile: 19",
                fontface ="italic"), 
            size = 4, color="black",
            hjust = 0)+
  labs(x="Degree: Family\nwith Outliers\n", y="Sample Size")+
  scale_x_continuous(labels = scales::scientific,
                     breaks=seq(0,100000,30000))


figAppendix<-ggarrange(D_TOTAL_h, D_TOTAL_OH_h, blank,
                       D_INTERACT_h, D_CLOSE_h,  
                       D_FAM_h,
                       ncol=3, nrow=2,
                       common.legend = TRUE)

figAppendix

#Saved as a pdf for a size7.5 x 5