/*------------------------------------------------------------------------------
Purpose: Prepare the SDCCP survey dataset, build discrete-time event-history
         (EHA) datasets for the accessibility of first results / data, and
         produce the figures (1-3) and mixed-model results (Table 2).
Input:   ${datadir}\SDCCP1_v1-0-0.dta (Gesis archive, https://doi.org/10.7802/2652)
Output:  analytical dataset, EHA snapshots, figure .emf/.eps files, Table 2 .doc
NOTE(review): requires the user-written -outreg2- package (section 4).
------------------------------------------------------------------------------*/
clear all
cap log close

********************************************************************************
**# 0) Set-up ******************************************************************
********************************************************************************

*** define paths
cd ""
global datadir =""
global dodir =""
global logdir =""
global resultsdir =""

*** load dataset (available from the Gesis archive under https://doi.org/10.7802/2652)
use ${datadir}\SDCCP1_v1-0-0.dta, clear

********************************************************************************
**# 1) Preparing variables / dataset for analysis ******************************
********************************************************************************

*** basic variables ************************************************************

*** mode (single-mode surveys coded 1-4, more than one mode = 5, none = -9)
egen modecount = anycount(v47_1 v47_2 v47_3 v47_4), values(1 2 3 4)
gen mode=.
replace mode=1 if v47_1==1 & v47_2==0 & v47_3==0 & v47_4==0
replace mode=2 if v47_1==0 & v47_2==1 & v47_3==0 & v47_4==0
replace mode=3 if v47_1==0 & v47_2==0 & v47_3==1 & v47_4==0
replace mode=4 if v47_1==0 & v47_2==0 & v47_3==0 & v47_4==1
replace mode=5 if modecount>1
recode mode (. = -9)
label variable mode "Survey mode"
label define mode 1 "CAPI" 2 "CATI" 3 "CAWI" 4 "PAPI" 5 "mixed-mode" -9 "no information", replace
label value mode mode
numlabel mode, add
tab mode, m
drop modecount

*** nonprob (1 = nonprobability sample; missing info on v28 recoded to -9)
gen nonprob = v28 == 2 if v28 != -9
recode nonprob (. = -9)
label variable nonprob "survey is nonprob."
label value nonprob yesno // NOTE(review): label -yesno- is not defined in this do-file; presumably shipped with the dataset - verify
tab nonprob, m

*** field_start_my (monthly date of fieldwork start)
gen field_start_my = ym(v63,v62)
recode field_start_my (720/721 = 722) //3 surveys started before March 2020 (special cases), fieldwork start date recoded to March for the analysis

*labels
#delimit ;
label define months
	722 "Mar. 20"  723 "Apr. 20"  724 "May 20"   725 "June 20"  726 "July 20"
	727 "Aug. 20"  728 "Sept. 20" 729 "Oct. 20"  730 "Nov. 20"  731 "Dec. 20"
	732 "Jan. 21"  733 "Feb. 21"  734 "Mar. 21"  735 "Apr. 21"  736 "May 21"
	737 "June 21"  738 "July 21"  739 "Aug. 21"  740 "Sept. 21" 741 "Oct. 21"
	742 "Nov. 21"  743 "Dec. 21" ;
#delimit cr
label values field_start_my months
tab field_start_my, m

*** field_start_my_num - numbered variable with 03/20 = 1
gen field_start_my_num = field_start_my - 721
label values field_start_my_num months
tab field_start_my_num, m

*** number of fieldwork starts per month
bysort field_start_my: egen fsmy_count=count(rowid)
label variable fsmy_count "Total"

*** survey design dimension ****************************************************

*** desgroups - combinations of survey design (sampling (prob./nonprob.) + mode)
gen desgroups =.
replace desgroups = 1 if nonprob == 0 & mode==2
replace desgroups = 2 if nonprob == 0 & mode==3
replace desgroups = 3 if nonprob == 0 & mode==5
replace desgroups = 4 if nonprob == 0 & mode==1 | nonprob == 0 & mode==4
replace desgroups = 5 if nonprob == 1 & mode==3
replace desgroups = 6 if nonprob == 1 & mode==5
label define desgroups_lbl 1 "Prob./CATI" 2 "Prob./CAWI" 3 "Prob./mixed" 4 "Prob./CAPI|PAPI" 5 "Nonprob./CAWI" 6 "Nonprob./mixed"
label values desgroups desgroups_lbl
numlabel desgroups_lbl, add
tab desgroups, m

*** interpretability dimension *************************************************

*** v25_int (information on the target population)
gen v25_int=0 if regexm(v25, "-9") == 1
replace v25_int=1 if v25_int!=0
replace v25_int=1 if rowid==46 | rowid==47 /* manual correction */
tab v25_int

*** v39_int (information on the concrete sampling procedure)
gen v39_int=0 if v39_1==-9 /* missing information on the concrete sampling procedure means -9 in v39_1 (and other categories v39_* as well) */
replace v39_int=1 if v39_int!=0
tab v39_int v39_1

*** v68_int (information on the sample size)
gen v68_int=0 if v68==-9
replace v68_int=1 if v68_int!=0
tab v68 v68_int

*** field_full_int (information on the date of data collection)
gen field_full_int=0 if v61==-9 | v62==-9 | v63==-9 | v64==-9 | v65==-9 | v66==-9
replace field_full_int=1 if field_full_int!=0
tab field_full_int

*** anyrate_int (information on any outcome rate (0 if no rates coded at all))
gen anyrate_int=.
replace anyrate_int=1 if v70>0 | v72>0 | v74>0 | v76>0 | v78>0
replace anyrate_int=0 if v70<0 & v72<0 & v74<0 & v76<0 & v78<0
tab anyrate_int, m

*** full interpretability index (5 variables)
gen ind_int=.
* v68_int spelled out in full (was "v68_", which relied on Stata's variable-name
* abbreviation and breaks under "set varabbrev off")
replace ind_int = v25_int+v39_int+v68_int+field_full_int+anyrate_int
label variable ind_int "Interpretability"
tab ind_int

*** restricted interpretability index (4 variables without any_rate, for robustness checks)
gen ind_int_4=.
replace ind_int_4 = v25_int+v39_int+v68_int+field_full_int
label variable ind_int_4 "Interpretability - without outcome rate"
tab ind_int_4

*** accessibility dimension ****************************************************

*** first results available
gen results_avail = v93 == 1

*** month/year of first results publication
gen results_my = ym(v96,v95)
tab results_my, m

*** time elapsed until results publication (in months); not published until coding = 999
gen results_timeelapse = results_my - field_start_my
replace results_timeelapse = 999 if results_avail == 0
tab results_timeelapse

*** data available
gen data_avail = v86 == 1

*** month/year of first data publication
gen data_my = ym(v92,v91)
tab data_my, m

*** time elapsed until data publication (in months); not published until coding = 999
gen data_timeelapse = data_my - field_start_my
replace data_timeelapse = 999 if data_avail == 0
tab data_timeelapse

*** drop missings and save dataset *********************************************
* observations with missing information on at least one of the following variables excluded: fieldwork start date, sampling procedure (prob.x nonprob.), mode
drop if field_start_my==. /* 4 obs. dropped */
drop if desgroups==. | desgroups==6 /* 27 obs. dropped */

*** save the dataset for analysis
save ${datadir}\SDCCP_AP1_analytical.dta, replace

********************************************************************************
**# 2) Building of further (EHA) datasets (accessibility of first results / data)
********************************************************************************

*** load and snapshot the analytical dataset ***********************************
use ${datadir}\SDCCP_AP1_analytical.dta
snapshot erase _all
snapshot save, label(SDCCP_AP1_analytical)

********************************************************************************
**# 2a) Building the EHA datasets for assessing accessibility of results********
********************************************************************************
* EHA = event history analysis

*** Restrict dataset to useable observations ***********************************
keep if results_my < . | results_avail == 0 //keep if A) date of results publication is known or B) no results were available at all.
tab v1 //N = 483

*** snapshot of this datafile **************************************************
snapshot save, label(analytical_access_results)

*** Building the EHA dataset ***************************************************

*** independent variables - Appendix Table A1
tabstat (field_start_my_num), statistics(n mean sd)
tab desgroups

*** Expand dataset (i.e. creation of a discrete event history dataset format)***
gen last_obs = 760
gen last_obs_str = string(last_obs, "%tm") //data right-censored from May 2023 onwards
gen mdate_diff = last_obs - field_start_my
* Duplicate each observation with the number of times stored in mdate_diff
replace mdate_diff = mdate_diff + 1
expand mdate_diff, generate(n_expanded)
sort rowid mdate_diff
* Creating a variable that counts through the number of lines per ID (rowid)
bysort rowid: gen seq_number = _n
* same variable minus 1 (as this variable is used to compute the monthly date-variable in a next step)
replace seq_number = seq_number - 1
* computation of monthly date-variable
gen mdate = field_start_my + seq_number
*> now every survey's first row represents the month of fieldwork start and its last row represents the month 760 (i.e. May 2023)
*compute event indicator
gen event = results_my == mdate
*censoring after event (drop all person-months after the event month)
bysort rowid (mdate): gen helpvar1 = sum(event)
bysort rowid (mdate): gen helpvar2 = sum(helpvar1)
tab n_expanded if helpvar2 >=2
drop if helpvar2 >=2
drop helpvar1 helpvar2

*** Creating Additional Variables **********************************************

*** month since fieldwork start (i.e. process time)
*month since fieldwork start - Various specifications
gen msfs = seq_number //basic
gen msfs_log = log(msfs+1) //log of
*mean-centering of msfs
summ msfs
gen msfs_c = msfs - `r(mean)'
*Dummy coding
gen msfs_k1 = msfs
* upper bound aligned with section 2b (max. msfs is 38, so identical to the former 24/99 coding)
recode msfs_k1 (24/999=24)

*** month of fieldwork start (i.e. period / calendar-time indicator)
*Transformation of field_start_my (1 = March 2020)
replace field_start_my = field_start_my - 721
*mean-centering of field_start_my
summ field_start_my
gen fs_myc = field_start_my - `r(mean)'

*** snapshot of this datafile **************************************************
snapshot save, label(EHA_access_results)

********************************************************************************
**# 2b) Building the EHA datasets for assessing accessibility of data***********
********************************************************************************
snapshot restore 1

*** Restrict dataset to useable observations ***********************************
keep if data_my < . | data_avail == 0 //keep if A) date of data publication is known or B) no data were available at all.
tab v1 //N = 590

*** snapshot of this datafile **************************************************
snapshot save, label(analytical_access_data)

*** Building the EHA dataset ***************************************************

*** independent variables - Appendix Table A1
tabstat (field_start_my_num), statistics(n mean sd)
tab desgroups

*** Expand dataset (i.e. creation of a discrete event history dataset format)***
gen last_obs = 760
gen last_obs_str = string(last_obs, "%tm") //data right-censored from May 2023 onwards
gen mdate_diff = last_obs - field_start_my
* Duplicate each observation with the number of times stored in mdate_diff
replace mdate_diff = mdate_diff + 1
expand mdate_diff, generate(n_expanded)
sort rowid mdate_diff
* Creating a variable that counts through the number of lines per ID (rowid)
bysort rowid: gen seq_number = _n
* same variable minus 1 (as this variable is used to compute the monthly date-variable in a next step)
replace seq_number = seq_number - 1
* computation of monthly date-variable
gen mdate = field_start_my + seq_number
*> now every survey's first row represents the month of fieldwork start and its last row represents the month 760 (i.e. May 2023)
*compute event indicator
gen event = data_my == mdate
*censoring after event (drop all person-months after the event month)
bysort rowid (mdate): gen helpvar1 = sum(event)
bysort rowid (mdate): gen helpvar2 = sum(helpvar1)
tab n_expanded if helpvar2 >=2
drop if helpvar2 >=2
drop helpvar1 helpvar2

*** Creating Additional Variables **********************************************

*** month since fieldwork start (i.e. process time)
*month since fieldwork start - Various specifications
gen msfs = seq_number //basic
gen msfs_log = log(msfs+1) //log of
*mean-centering of msfs
summ msfs
gen msfs_c = msfs - `r(mean)'
*Dummy coding
gen msfs_k1 = msfs
recode msfs_k1 (24/999=24)

*** month of fieldwork start (i.e. period / calendar-time indicator)
*Transformation of field_start_my (1 = March 2020)
replace field_start_my = field_start_my - 721
*mean-centering of field_start_my
summ field_start_my
gen fs_myc = field_start_my - `r(mean)'

*** snapshot of this datafile **************************************************
snapshot save, label(EHA_access_data)

********************************************************************************
**# 2c) Overview snapshots
********************************************************************************
snapshot list _all

********************************************************************************
**# 2d) Dataset description / overview
********************************************************************************

*** independent variables - Appendix Table A1
*DS for survey design and interpretability
snapshot restore 1
tabstat (field_start_my_num), statistics(n mean sd)
tab desgroups
*DS for accessibility first results
snapshot restore 2
tabstat (field_start_my_num), statistics(n mean sd)
tab desgroups
*DS for accessibility data
snapshot restore 4
tabstat (field_start_my_num), statistics(n mean sd)
tab desgroups

*** describe the main analytical sample - survey_per_program
preserve
snapshot restore 1
sort rowid v2
egen tag = tag(v2) // Tags only the first occurrence of each unique value of v2
count if tag == 1 // Counts the unique values in v2 - 183
bysort v2: gen survey_per_program = _N if _n == 1
tab survey_per_program
sum survey_per_program, detail
restore

********************************************************************************
**# 3) Results - Survey quality during the pandemic ****************************
********************************************************************************
snapshot restore 1

*** survey design **************************************************************
tab nonprob
tab desgroups

*** interpretability ***********************************************************
sum ind_int, detail
tab ind_int

**# Figure 1 - Accessibility of results and data after fieldwork start *********
/*Notes:
- Computation of fitted survival probability (surv_v1) based on fitted hazard (haz_v1)
  as described in: Singer, J. D., & Willett, J. B. (2003). Applied longitudinal data
  analysis: Modeling change and event occurrence. Oxford university press, from there:
  Chapter 12: Extending the Discrete-time Hazard Model | Stata Textbook Examples
  (Syntax for Figure 12.1)
- cumulative probability of event occurrence (kumprob) is computed as the inverse of
  surv_v1, 1-surv_v1, and plotted in Figure 1
- The cumulative probability of event occurrence is also known as the
  "cumulative distribution function" */

*** Figure 1 - results by desgroups ********************************************

*** number of results publications per design group per month
snapshot restore 3
sort msfs desgroups
by msfs desgroups: egen num_event=count(event) if event==1
table msfs desgroups, stat(count num_event)

*** fitted cumulative probability of results publication, overall (g = 0) and
*** per design group (g = 1-5); the loop replaces six near-identical
*** copy-paste blocks - behavior unchanged
foreach g of numlist 0/5 {
	snapshot restore 3
	if `g' > 0 {
		keep if desgroups==`g'
	}
	*Compute the fitted hazard rate
	capture drop ha*
	capture drop sur*
	sort msfs
	by msfs: egen haz_v1 = mean(event)
	*Compute the fitted survival probability
	sort rowid msfs
	order rowid, before (msfs)
	gen surv_v1 = 1
	replace surv_v1 = (1-haz_v1)*surv_v1 if msfs == 0
	replace surv_v1 = (1-haz_v1)*surv_v1[_n-1] if msfs > 0
	sort msfs
	*Compute the cumulative probability of event occurrence (shown in Figure 1)
	gen kumprob = 1-surv_v1
	*keep one row per msfs value
	sort msfs
	gen h1 = msfs != msfs[_n-1]
	keep if h1 == 1
	drop h1
	keep msfs kumprob
	rename kumprob kumprob_results_`g'
	if `g' == 0 {
		save ${datadir}\kp_results_desgroups.dta, replace
	}
	else {
		merge 1:1 msfs using ${datadir}\kp_results_desgroups.dta
		drop _merge
		save ${datadir}\kp_results_desgroups.dta, replace
	}
}

*** figure
use ${datadir}\kp_results_desgroups.dta
graph twoway (line kumprob_results_0 msfs, lpattern(solid) lwidth (medthick) color(black)) ///
	(line kumprob_results_1 msfs, lpattern(solid) lwidth (medthick) color(navy)) ///
	(line kumprob_results_2 msfs, lpattern(solid) lwidth (medthick) color(purple)) ///
	(line kumprob_results_3 msfs, lpattern(solid) lwidth (medthick) color(gold)) ///
	(line kumprob_results_4 msfs, lpattern(solid) lwidth (medthick) color(green)) ///
	(line kumprob_results_5 msfs, lpattern(solid) lwidth (medthick) color(orange)), ///
	xtitle("Months since fieldwork start") ///
	ytitle("Cumulative probability") ///
	xtick(0(1)38) ///
	xlabel(0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38, valuelabel labsize(small)) ///
	ylabel(0.1(0.1)1.0, grid format(%9.2f) gmin angle(horizontal)) ///
	scheme(s1mono) ///
	legend(lab(1 "Total") lab(2 "Prob./CATI") lab(3 "Prob./CAWI") lab(4 "Prob./mixed") lab(5 "Prob./CAPI|PAPI") ///
	lab(6 "Nonprob./CAWI") pos(6) ring(1) region(lstyle(none))) ///
	title("a) Accessibility of results (N = 483)") saving(figure1_results_bydesgroups_with_total)
graph export ${resultsdir}\figure1_results_bydesgroups_with_total.emf, replace
graph export ${resultsdir}\figure1_results_bydesgroups_with_total.eps, replace

*** Figure 1 - data by desgroups ***********************************************

*** fitted cumulative probability of data publication, overall (g = 0) and
*** per design group (g = 1-5)
foreach g of numlist 0/5 {
	snapshot restore 5
	if `g' > 0 {
		keep if desgroups==`g'
	}
	*Compute the fitted hazard rate
	capture drop ha*
	capture drop sur*
	sort msfs
	by msfs: egen haz_v1 = mean(event)
	*Compute the fitted survival probability
	sort rowid msfs
	order rowid, before (msfs)
	gen surv_v1 = 1
	replace surv_v1 = (1-haz_v1)*surv_v1 if msfs == 0
	replace surv_v1 = (1-haz_v1)*surv_v1[_n-1] if msfs > 0
	sort msfs
	*Compute the cumulative probability of event occurrence (shown in Figure 1)
	gen kumprob = 1-surv_v1
	*keep one row per msfs value
	sort msfs
	gen h1 = msfs != msfs[_n-1]
	keep if h1 == 1
	drop h1
	keep msfs kumprob
	rename kumprob kumprob_data_`g'
	if `g' == 0 {
		save ${datadir}\kp_data_desgroups.dta, replace
	}
	else {
		merge 1:1 msfs using ${datadir}\kp_data_desgroups.dta
		drop _merge
		save ${datadir}\kp_data_desgroups.dta, replace
	}
}

*** figure
use ${datadir}\kp_data_desgroups.dta
graph twoway (line kumprob_data_0 msfs, lpattern(solid) lwidth (medthick) color(black)) ///
	(line kumprob_data_1 msfs, lpattern(solid) lwidth (medthick) color(navy)) ///
	(line kumprob_data_2 msfs, lpattern(solid) lwidth (medthick) color(purple)) ///
	(line kumprob_data_3 msfs, lpattern(solid) lwidth (medthick) color(gold)) ///
	(line kumprob_data_4 msfs, lpattern(solid) lwidth (medthick) color(green)) ///
	(line kumprob_data_5 msfs, lpattern(solid) lwidth (medthick) color(orange)), ///
	xtitle("Months since fieldwork start") ///
	ytitle("Cumulative probability") ///
	xtick(0(1)38) ///
	xlabel(0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38, valuelabel labsize(small)) ///
	ylabel(0.1(0.1)1.0, grid format(%9.2f) gmin angle(horizontal)) ///
	scheme(s1mono) ///
	legend(lab(1 "Total") lab(2 "Prob./CATI") lab(3 "Prob./CAWI") lab(4 "Prob./mixed") lab(5 "Prob./CAPI|PAPI") ///
	lab(6 "Nonprob./CAWI") pos(6) ring(1) region(lstyle(none))) ///
	title("b) Accessibility of data (N = 590)") saving(figure1_data_bydesgroups_with_total)
graph export ${resultsdir}\figure1_data_bydesgroups_with_total.emf, replace
graph export ${resultsdir}\figure1_data_bydesgroups_with_total.eps, replace

*** Figure 1 - combined - results and data by desgroups ************************
graph combine figure1_results_bydesgroups_with_total.gph figure1_data_bydesgroups_with_total.gph, scale(0.8)
graph export ${resultsdir}\figure1_combo.emf, replace
graph export ${resultsdir}\figure1_combo.eps, replace

**# Figure 2 - Number of surveys over time *************************************
snapshot restore 1
preserve
levelsof desgroups, local(groups)
levelsof field_start_my, local(months)
foreach group in `groups' {
	gen fsmy_count_`group' = 0
	label variable fsmy_count_`group' "Count for `group'"
	foreach month in `months' {
		quietly count if desgroups == `group' & field_start_my == `month'
		replace fsmy_count_`group' = r(N) if field_start_my == `month'
	}
}
twoway (line fsmy_count field_start_my, lwidth(medthick) color(black)) ///
	(line fsmy_count_1 field_start_my, lwidth(medthick) color(navy)) ///
	(line fsmy_count_2 field_start_my, lwidth(medthick) color(purple)) ///
	(line fsmy_count_3 field_start_my, lwidth(medthick) color(gold)) ///
	(line fsmy_count_4 field_start_my, lwidth(medthick) color(green)) ///
	(line fsmy_count_5 field_start_my, lwidth(medthick) color(orange)), ///
	xlabel(722(3)743, valuelabel angle(45)) scheme(s1mono) ///
	xtitle("Month of fieldwork start") ytitle("") ylabel(0(10)80, angle(0)) ///
	legend(lab(1 "Total") lab(2 "Prob./CATI") lab(3 "Prob./CAWI") lab(4 "Prob./mixed") ///
	lab(5 "Prob./CAPI|PAPI") lab(6 "Nonprob./CAWI") pos(6) ring(1) row(2) region(lstyle(none)))
graph export ${resultsdir}\figure2.emf, replace
graph export ${resultsdir}\figure2.eps, replace
restore

**# Figure 3 - Quality of social science surveys over time *********************
snapshot restore 1

*** survey quality over time - survey design ***
*** additional variables for the figure
tab desgroups, gen(desgroup)
bysort field_start_my: egen desgroup1_count= sum(desgroup1)
bysort field_start_my: egen desgroup2_count= sum(desgroup2)
bysort field_start_my: egen desgroup3_count= sum(desgroup3)
bysort field_start_my: egen desgroup4_count= sum(desgroup4)
bysort field_start_my: egen desgroup5_count= sum(desgroup5)
label variable desgroup1_count "Prob./CATI"
label variable desgroup2_count "Prob./CAWI"
label variable desgroup3_count "Prob./mixed"
label variable desgroup4_count "Prob./CAPI|PAPI"
label variable desgroup5_count "Non-prob./CAWI"
gen desgroup_total=.
replace desgroup_total=desgroup1_count+desgroup2_count+desgroup3_count+desgroup4_count+desgroup5_count
gen desgroup1_pct=desgroup1_count/desgroup_total
gen desgroup2_pct=desgroup2_count/desgroup_total
gen desgroup3_pct=desgroup3_count/desgroup_total
gen desgroup4_pct=desgroup4_count/desgroup_total
gen desgroup5_pct=desgroup5_count/desgroup_total
* cumulative shares for the stacked area plot
gen zero = 0
gen pct1=desgroup1_count/desgroup_total
gen pct2=(desgroup1_count+desgroup2_count)/desgroup_total
gen pct3=(desgroup1_count+desgroup2_count+desgroup3_count)/desgroup_total
gen pct4=(desgroup1_count+desgroup2_count+desgroup3_count+desgroup4_count)/desgroup_total
gen pct5= 1

*** figure 3a)- distribution of survey designs over time
twoway (rarea zero pct1 field_start_my, fcolor (navy) lcolor(black) lwidth(vthin)) ///
	(rarea pct1 pct2 field_start_my, fcolor (purple) lcolor(black) lwidth(vthin)) ///
	(rarea pct2 pct3 field_start_my, fcolor (gold) lcolor(black) lwidth(vthin)) ///
	(rarea pct3 pct4 field_start_my, fcolor (green) lcolor(black) lwidth(vthin)) ///
	(rarea pct4 pct5 field_start_my, fcolor (orange) lcolor(black) lwidth(vthin)), ///
	legend(order (1 "Prob./CATI" 2 "Prob./CAWI" 3 "Prob./mixed" 4 "Prob./CAPI|PAPI" 5 "Non-prob./CAWI") position(6) rows(1) stack ///
	region(lstyle(foreground)) size(small)) ///
	graphregion(fcolor(white)) ///
	xlabel(722(3)743, valuelabel) xtitle("Month of fieldwork start") ytitle("Relative frequency (%)") ///
	title("a) Survey design distribution (N = 686)") ///
	saving(fig3a)
graph export ${resultsdir}\figure3a.emf, replace

*** survey quality over time - interpretability ***
*** additional variables for the figure
sort field_start_my
by field_start_my: egen mind_int = mean(ind_int) if field_start_my <.

*** figure 3b)- mean interpretability over time
twoway (line mind_int field_start_my, cmissing(no) lcolor(gs10) lpattern(solid)) ///
	(lowess mind_int field_start_my, cmissing(no) lcolor(gs0) lpattern(solid)), ///
	xtitle("Month of fieldwork start") ///
	ytitle("Mean interpretability") ///
	xlabel(722(3)743, valuelabel nogrid) ///
	ylabel(3.4(0.1)4.1, grid format(%9.1f) gmin angle(horizontal)) ///
	legend(lab(1 "observed") lab(2 "smoothed (LOWESS)") position(6) rows(1) region(lstyle(foreground))) ///
	graphregion(fcolor(white)) ///
	title("b) Interpretability (N = 686)") ///
	saving(fig3b)
graph export ${resultsdir}\figure3b.emf, replace

*** survey quality over time - accessibility of results ***
snapshot restore 2
*indicator: results publication within 2 months after fieldwork start
gen results_2m = results_timeelapse <= 2
sort field_start_my
by field_start_my: egen mresults_2m = mean(results_2m)
tab field_start_my results_2m, row

*** figure 3c)- accessibility of results (proportion of surveys with results publication within 2 months after fieldwork start) over time
twoway (line mresults_2m field_start_my, cmissing(no) lcolor(gs10) lpattern(solid)) ///
	(lowess mresults_2m field_start_my, cmissing(no) lcolor(gs0) lpattern(solid)), ///
	xtitle("Month of fieldwork start") ///
	ytitle("Relative frequency (%)") ///
	xlabel(722(3)743, valuelabel nogrid) ///
	ylabel(0.2(0.1)0.7, grid format(%9.1f) gmin angle(horizontal)) ///
	legend(lab(1 "observed") lab(2 "smoothed (LOWESS)") position (6) rows(1) region(lstyle(foreground))) ///
	graphregion(fcolor(white)) ///
	title("c) Accessibility of results (N = 483)") ///
	saving(fig3c)
graph export ${resultsdir}\figure3c.emf, replace

*** survey quality over time - accessibility of data ***
snapshot restore 4
*indicator: data publication within 12 months after fieldwork start
gen data_12m = data_timeelapse <= 12
sort field_start_my
by field_start_my: egen mdata_12m = mean(data_12m)
tab field_start_my data_12m, row

*** figure 3d)- accessibility of data (proportion of surveys with data publication within 12 months after fieldwork start) over time
capture erase fig3d.gph
twoway (line mdata_12m field_start_my, cmissing(no) lcolor(gs10) lpattern(solid)) ///
	(lowess mdata_12m field_start_my, cmissing(no) lcolor(gs0) lpattern(solid)), ///
	xtitle("Month of fieldwork start") ///
	ytitle("Relative frequency (%)") ///
	xlabel(722(3)743, valuelabel nogrid) ///
	ylabel(0.2(0.1)0.7, grid format(%9.1f) gmin angle(horizontal)) ///
	legend(lab(1 "observed") lab(2 "smoothed (LOWESS)") position (6) rows(1) region(lstyle(foreground))) ///
	graphregion(fcolor(white)) ///
	title("d) Accessibility of data (N = 590)") ///
	saving(fig3d)
graph export ${resultsdir}\figure3d.emf, replace

***Combine graph ***
graph combine fig3a.gph fig3b.gph fig3c.gph fig3d.gph, scale(0.8)
graph export ${resultsdir}\figure3_combo.emf, replace
* NOTE(review): .emf export is named figure3_combo but .eps is named figure3 - confirm this is intended
graph export ${resultsdir}\figure3.eps, replace

********************************************************************************
**# 4) Results - Associations between quality dimensions ***********************
********************************************************************************

**# 4.1) interpretability *****************************************************
snapshot restore 1

*** model 1 - Table 2
mixed ind_int c.field_start_my##c.field_start_my || v2:, vce(cluster v2)
estat icc
estat ic
mat s=r(S)
local ll = s[1,3] // Log-Likelihood
local aic = s[1,5] // AIC
local bic = s[1,6] // BIC
display "Log-Likelihood: `ll'"
display "AIC: `aic'"
display "BIC: `bic'"
* back-transform the log-standard-deviation entries of e(b) into variances
matrix b = e(b)
local id_var_col = colnumb(b, "lns1_1_1:_cons")
local id_variance = exp(b[1, `id_var_col'])^2
local res_var_col = colnumb(b, "lnsig_e:_cons")
local res_variance = exp(b[1, `res_var_col'])^2
display `id_variance', `res_variance'
outreg2 using "${resultsdir}/Table2_multi_interpretability.doc", replace label ctitle(Model 1) dec(3) addstat(Var_constant,"`id_variance'", Var_res_variance, "`res_variance'", Log-Likelihood, `ll', aic, `aic', bic, `bic')
estimates store model1

*** model 2 - Table 2
mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
estat icc
estat ic
mat s=r(S)
local ll = s[1,3] // Log-Likelihood
local aic = s[1,5] // AIC
local bic = s[1,6] // BIC
display "Log-Likelihood: `ll'"
display "AIC: `aic'"
display "BIC: `bic'"
matrix b = e(b)
local id_var_col = colnumb(b, "lns1_1_1:_cons")
local id_variance = exp(b[1, `id_var_col'])^2
local res_var_col = colnumb(b, "lnsig_e:_cons")
local res_variance = exp(b[1, `res_var_col'])^2
display `id_variance', `res_variance'
outreg2 using "${resultsdir}/Table2_multi_interpretability.doc", label ctitle(Model 2) dec(3) append addstat(Var_constant,"`id_variance'", Var_res_variance, "`res_variance'", Log-Likelihood, `ll', aic, `aic', bic, `bic')
estimates store model2

*** pseudo R2
* Step 1: Fit the null model (no predictors) and extract variance components
mixed ind_int || v2:, vce(cluster v2)
matrix b = e(b)
local id_var_col_null = colnumb(b, "lns1_1_1:_cons")
local var_v2_null = exp(b[1, `id_var_col_null'])^2
local res_var_col_null = colnumb(b, "lnsig_e:_cons")
local var_residual_null = exp(b[1, `res_var_col_null'])^2
display `var_v2_null' `var_residual_null'
* Step 2: Fit the full model and extract variance components
mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
matrix b = e(b)
local id_var_col_full = colnumb(b, "lns1_1_1:_cons")
local var_v2_full = exp(b[1, `id_var_col_full'])^2
local res_var_col_full = colnumb(b, "lnsig_e:_cons")
local var_residual_full = exp(b[1, `res_var_col_full'])^2
display `var_v2_full' `var_residual_full'
* Step 3: Calculate pseudo-R2 and display results
scalar R2_v2 = (`var_v2_null' - `var_v2_full') / `var_v2_null'
scalar R2_res = (`var_residual_null' - `var_residual_full') / `var_residual_null'
display "R2 for v2: " R2_v2
/* The random effect (v2) explains only about 11.8% of the variance in the dependent
variable (ind_int) after accounting for the predictors in the full model. This suggests
that adding the predictors (field_start_my and desgroups) does not significantly improve
the variance explained by the random effect, and the random effect plays a minor role in
explaining the outcome. */
display "R2 for residual: " R2_res
/* The residual variance is reduced by about 7.4% when the predictors are included in
the model. This suggests that the full model does a better job of explaining the
variation in ind_int, especially in terms of the residual variance, but still leaves a
significant amount of unexplained variance */

*** Wald test - desgroups
mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
testparm ib5.desgroups
/* desgroups sign.
improve the model */

*** AIC/BIC - desgroups
mixed ind_int c.field_start_my##c.field_start_my || v2:, mle vce(cluster v2)
estimates store reduced
mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups || v2:, mle vce(cluster v2)
estimates store full
estimates stats full reduced /* the full model fits better - AIC and BIC lower */

*** AIC/BIC - quadratic vs. linear term for field_start_my
mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
estimates store quadr
mixed ind_int c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
estimates store lin
estimates stats quadr lin

*** LR test of random effects (v2)
mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups || v2:, mle
estimates store re
mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups, mle
estimates store nore
lrtest re nore /* RE justified - the random intercepts for v2 account for meaningful variability in the outcome ind_int */

*** robustness check - restricted interpretability index - Appendix Table A2
* restricted index
mixed ind_int_4 c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
outreg2 using "${resultsdir}/TableA2_multi_robustcheck_interpret.doc", replace label ctitle(Interpret_ind_4) dec(3)
* full index
mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
outreg2 using "${resultsdir}/TableA2_multi_robustcheck_interpret.doc", label ctitle(Interpret_ind_5) dec(3) append

*** calendar time effect - predicted values - Appendix Figure A1
* regression models
eststo clear
eststo: mixed ind_int c.field_start_my##c.field_start_my || v2:, vce(cluster v2)
margins, at(field_start_my=(722(1)743)) noestimcheck saving(file1, replace)
eststo: mixed ind_int c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
margins, at(field_start_my=(722(1)743)) noestimcheck saving(file2, replace)
esttab, not label r2 obslast star(+ 0.10 * 0.05 ** 0.01) nodepvars nonumbers mtitles("1" "2") title("Effect of sampling and mode on interpretability")

* figure A1
combomarginsplot file1 file2, ///
	labels("unadjusted" "adjusted by survey design") ///
	noci ///
	file1opts(recast(line) lcolor(gs10) lpattern(solid)) ///
	file2opts(recast(line) lcolor(gs0) lpattern(solid)) ///
	xtitle("Month of fieldwork start") ///
	ytitle("Predicted mean") ///
	xlabel(722(3)743, valuelabel) ///
	ylabel(3.4(0.1)3.7, grid format(%9.2f) gmin angle(horizontal)) ///
	legend(colfirst) ///
	title("", justification(left)) ///
	scheme(s1mono)
graph export ${resultsdir}\figurea1_multi_interpret_app.emf, replace
graph export ${resultsdir}\figurea1_multi_interpret_app.eps, replace

**# 4.2) accessibility first results *******************************************
snapshot restore 3

*** Choosing a specification for modelling process time ***
* comparison of AIC/BIC criterion
eststo clear
eststo: melogit event c.msfs_c || v2:, vce(cluster v2) //1) linear
eststo: melogit event c.msfs_c##c.msfs_c || v2:, vce(cluster v2) //2) quadratic
eststo: melogit event c.msfs_c##c.msfs_c##c.msfs_c || v2:, vce(cluster v2) //3) cubic
eststo: melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c || v2:, vce(cluster v2) //4) fourth order
eststo: melogit event c.msfs_log || v2:, vce(cluster v2) //5) analogous to Weibull model (https://www.stata.com/manuals13/stdiscrete.pdf)
eststo: melogit event c.msfs c.msfs_log || v2:, vce(cluster v2) //6) linear + log
eststo: melogit event c.msfs##c.msfs_log || v2:, vce(cluster v2) //7) linear ## log
eststo: melogit event c.msfs_log##c.msfs_log || v2:, vce(cluster v2) //8) log ## log
eststo: melogit event i.msfs_k1 || v2:, vce(cluster v2) //9) Dummy coding
* FIX: nine models are stored above, so nine model titles are needed (was "1"-"8")
esttab, aic bic not label pr2 obslast star(+ 0.10 * 0.05 ** 0.01) nodepvars nonumbers mtitles("1" "2" "3" "4" "5" "6" "7" "8" "9")

*** Choosing a specification for modelling calendar time dependency ***
* comparison of AIC/BIC (I)
eststo clear
eststo: melogit event c.msfs_log##c.msfs_log c.fs_myc || v2:, vce(cluster v2) //1) linear
eststo: melogit event c.msfs_log##c.msfs_log c.fs_myc##c.fs_myc || v2:, vce(cluster v2) //2) quadratic
eststo: melogit event c.msfs_log##c.msfs_log c.fs_myc##c.fs_myc##c.fs_myc || v2:, vce(cluster v2) //3) cubic
eststo: melogit event c.msfs_log##c.msfs_log c.fs_myc##c.fs_myc##c.fs_myc##c.fs_myc || v2:, vce(cluster v2) //4) fourth order
esttab, aic bic not label pr2 obslast star(+ 0.10 * 0.05 ** 0.01) nodepvars nonumbers mtitles("1" "2" "3" "4")

*** Models ***

*** model 3 - Table 2
melogit event c.msfs_log##c.msfs_log c.fs_myc##c.fs_myc##c.fs_myc || v2:, vce(cluster v2)
* FIX: was "estimates store model5", which section 4.3 later overwrites with its
* own model 5; store under the name matching the Table 2 numbering instead
estimates store model3
estat icc
estat ic
mat s = r(S)
local ll  = s[1,3] // Log-Likelihood
local aic = s[1,5] // AIC
local bic = s[1,6] // BIC
display "Log-Likelihood: `ll'"
display "AIC: `aic'"
display "BIC: `bic'"
outreg2 using "${resultsdir}/Table2_multi_accessibility_results.doc", replace label ctitle(Model 3) dec(3) addstat(Log-Likelihood, `ll', aic, `aic', bic, `bic')

*** model 4 - Table 2
melogit event c.msfs_log##c.msfs_log c.fs_myc##c.fs_myc##c.fs_myc ib5.desgroups || v2:, vce(cluster v2)
* FIX: was "estimates store model6" (same collision as above)
estimates store model4
estat icc
estat ic
mat s = r(S)
local ll  = s[1,3] // Log-Likelihood
local aic = s[1,5] // AIC
local bic = s[1,6] // BIC
display "Log-Likelihood: `ll'"
display "AIC: `aic'"
display "BIC: `bic'"
outreg2 using "${resultsdir}/Table2_multi_accessibility_results.doc", label ctitle(Model 4) dec(3) append addstat(Log-Likelihood, `ll', aic, `aic', bic, `bic')

*** Wald tests
melogit event c.msfs_log##c.msfs_log c.fs_myc##c.fs_myc##c.fs_myc ib5.desgroups || v2:, vce(cluster v2)
testparm ib5.desgroups
testparm c.fs_myc##c.fs_myc##c.fs_myc

*** Deriving predicted probabilities for Figure A1 (Appendix) ***
* NOTE(review): margins evaluates field_start_my at 1-22, so this assumes
* field_start_my is on the rescaled 1-22 scale (Mar. 20 = 1) in snapshot 3,
* not the 722-743 monthly coding used earlier -- verify against the snapshot
melogit event c.msfs_log##c.msfs_log c.field_start_my##c.field_start_my##c.field_start_my || v2:, vce(cluster v2)
margins, at(field_start_my=(1(1)22)) noestimcheck saving(file1, replace)
melogit event c.msfs_log##c.msfs_log c.field_start_my##c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2)
margins, at(field_start_my=(1(1)22)) noestimcheck saving(file2, replace)
combomarginsplot file1 file2, ///
	labels("unadjusted" "adjusted by survey design") ///
	noci ///
	file1opts(recast(line) lcolor(gs10) lpattern(solid)) ///
	file2opts(recast(line) lcolor(gs0) lpattern(solid)) ///
	xtitle("Month of fieldwork start") ///
	ytitle("Probability") ///
	xtick(1(1)22) ///
	xlabel(1 "Mar. 20" 3 "May 20" 5 "July 20" 7 "Sept. 20" 9 "Nov. 20" 11 "Jan. 21" 13 "Mar. 21" 15 "May 21" 17 "July 21" 19 "Sept. 21" 21 "Nov. 21", labsize(small)) ///
	ylabel(0.1(0.05)0.4, grid format(%9.2f) gmin angle(horizontal)) ///
	title("") ///
	/*subtitle("unadjusted vs. adjusted by survey design", justification(left))*/ ///
	legend(colfirst) ///
	scheme(s1mono)
graph export ${resultsdir}\figurea1_multi_access_results_app.emf, replace
graph export ${resultsdir}\figurea1_multi_access_results_app.eps, replace

**# 4.3) accessibility data ****************************************************
snapshot restore 5

*** Choosing a specification for modelling process time ***
* comparison of AIC/BIC criterion
eststo clear
eststo: melogit event c.msfs_c || v2:, vce(cluster v2) difficult intmethod(laplace) //1) linear
eststo: melogit event c.msfs_c##c.msfs_c || v2:, vce(cluster v2) difficult intmethod(laplace) //2) quadratic
eststo: melogit event c.msfs_c##c.msfs_c##c.msfs_c || v2:, vce(cluster v2) difficult intmethod(laplace) //3) cubic
eststo: melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c || v2:, vce(cluster v2) difficult intmethod(laplace) //4) fourth order
eststo: melogit event c.msfs_log || v2:, vce(cluster v2) difficult intmethod(laplace) //5) analogous to Weibull model
eststo: melogit event c.msfs c.msfs_log || v2:, vce(cluster v2) difficult intmethod(laplace) //6) linear + log
eststo: melogit event c.msfs##c.msfs_log || v2:, vce(cluster v2) difficult intmethod(laplace) //7) linear ## log
eststo: melogit event c.msfs_log##c.msfs_log || v2:, vce(cluster v2) difficult intmethod(laplace) //8) log ## log
eststo: melogit event i.msfs_k1 || v2:, vce(cluster v2) difficult intmethod(laplace) //9) Dummy coding
esttab, aic bic not label pr2 obslast star(+ 0.10 * 0.05 ** 0.01) nodepvars nonumbers mtitles("1" "2" "3" "4" "5" "6" "7" "8" "9")

*** Choosing a specification for modelling calendar time dependency ***
* comparison of AIC/BIC
eststo clear
eststo: melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.fs_myc || v2:, vce(cluster v2) difficult intmethod(laplace) //1) linear
eststo: melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.fs_myc##c.fs_myc || v2:, vce(cluster v2) difficult intmethod(laplace) //2) quadratic
eststo: melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.fs_myc##c.fs_myc##c.fs_myc || v2:, vce(cluster v2) difficult intmethod(laplace) //3) cubic
eststo: melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.fs_myc##c.fs_myc##c.fs_myc##c.fs_myc || v2:, vce(cluster v2) difficult intmethod(laplace) //4) fourth order
* FIX: replaced leftover placeholder title("XXX") with a descriptive table title
esttab, aic bic not label pr2 obslast star(+ 0.10 * 0.05 ** 0.01) nodepvars nonumbers mtitles("1" "2" "3" "4") title("Calendar time specification - accessibility of data")

*** Models ***

*** model 5 - Table 2
melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.fs_myc##c.fs_myc || v2:, vce(cluster v2) difficult intmethod(laplace)
estimates store model5
estat icc
estat ic
* r(S) from -estat ic-: col 3 = ll(model), col 5 = AIC, col 6 = BIC
mat s = r(S)
local ll  = s[1,3] // Log-Likelihood
local aic = s[1,5] // AIC
local bic = s[1,6] // BIC
display "Log-Likelihood: `ll'"
display "AIC: `aic'"
display "BIC: `bic'"
outreg2 using "${resultsdir}/Table2_multi_accessibility_data.doc", replace label ctitle(Model 5) dec(3) addstat(Log-Likelihood, `ll', aic, `aic', bic, `bic')

*** model 6 - Table 2
melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.fs_myc##c.fs_myc ib5.desgroups || v2:, vce(cluster v2) difficult intmethod(laplace)
estimates store model6
estat icc
estat ic
mat s = r(S)
local ll  = s[1,3] // Log-Likelihood
local aic = s[1,5] // AIC
local bic = s[1,6] // BIC
display "Log-Likelihood: `ll'"
display "AIC: `aic'"
display "BIC: `bic'"
outreg2 using "${resultsdir}/Table2_multi_accessibility_data.doc", label ctitle(Model 6) dec(3) append addstat(Log-Likelihood, `ll', aic, `aic', bic, `bic')

*** Wald tests
quietly melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.fs_myc##c.fs_myc ib5.desgroups || v2:, vce(cluster v2) difficult intmethod(laplace)
testparm ib5.desgroups
testparm c.fs_myc##c.fs_myc

*** Deriving predicted probabilities for Figure A1 (Appendix) ******************
melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.field_start_my##c.field_start_my || v2:, vce(cluster v2) difficult intmethod(laplace) // field_start_my - uncentered
margins, at(field_start_my=(1(1)22)) noestimcheck saving(file1, replace)
melogit event c.msfs_c##c.msfs_c##c.msfs_c##c.msfs_c c.field_start_my##c.field_start_my ib5.desgroups || v2:, vce(cluster v2) difficult intmethod(laplace) // field_start_my - uncentered
margins, at(field_start_my=(1(1)22)) noestimcheck saving(file2, replace)
combomarginsplot file1 file2, ///
	labels("unadjusted" "adjusted by survey design") ///
	noci ///
	file1opts(recast(line) lcolor(gs10) lpattern(solid)) ///
	file2opts(recast(line) lcolor(gs0) lpattern(solid)) ///
	xtitle("Month of fieldwork start") ///
	ytitle("Probability") ///
	xtick(1(1)22) ///
	xlabel(1 "Mar. 20" 3 "May 20" 5 "July 20" 7 "Sept. 20" 9 "Nov. 20" 11 "Jan. 21" 13 "Mar. 21" 15 "May 21" 17 "July 21" 19 "Sept. 21" 21 "Nov. 21", labsize(small)) ///
	ylabel(0.1(0.01)0.17, grid format(%9.2f) gmin angle(horizontal)) ///
	title("") ///
	/*subtitle("unadjusted vs. adjusted by survey design", justification(left))*/ ///
	legend(colfirst) ///
	scheme(s1mono)
graph export ${resultsdir}\figurea1_multi_access_data_app.emf, replace
graph export ${resultsdir}\figurea1_multi_access_data_app.eps, replace

********************************************************************************
**# 4.4) Robustness check survey design-accessibility (missing publication date)
********************************************************************************
snapshot restore 1

*** prepare vars
* grand-mean-center fieldwork start month for the cubic/quadratic calendar terms
summ field_start_my
gen fs_myc = field_start_my - `r(mean)'

*** define category of known publication but an unknown date
gen res_3cat = results_avail
replace res_3cat = 2 if results_avail==1 & results_my==.
label var res_3cat "Results published"
label define res_3cat 0 "No" 1 "Yes" 2 "Yes, but an unknown date", replace
label value res_3cat res_3cat
tab res_3cat

gen data_3cat = data_avail
replace data_3cat = 2 if data_avail==1 & data_my==.
label var data_3cat "Data published"
label define data_3cat 0 "No" 1 "Yes" 2 "Yes, but an unknown date", replace
label value data_3cat data_3cat
tab data_3cat

*** robustness check - mixed logit models
* results: exclude vs. include surveys whose publication date is unknown
melogit results_avail c.fs_myc##c.fs_myc##c.fs_myc ib5.desgroups if res_3cat!=2 || v2:, vce(cluster v2) difficult intmethod(laplace)
outreg2 using "${resultsdir}/TableA3_robustcheck_multi_access_res.doc", replace label ctitle(Results - without unknown date) dec(3)
melogit results_avail c.fs_myc##c.fs_myc##c.fs_myc ib5.desgroups || v2:, vce(cluster v2) difficult intmethod(laplace)
outreg2 using "${resultsdir}/TableA3_robustcheck_multi_access_res.doc", append label ctitle(Results - with unknown date) dec(3)
* data
melogit data_avail c.fs_myc##c.fs_myc ib5.desgroups if data_3cat!=2 || v2:, vce(cluster v2) difficult intmethod(laplace)
outreg2 using "${resultsdir}/TableA3_robustcheck_multi_access_data.doc", replace label ctitle(Results - without unknown date) dec(3)
melogit data_avail c.fs_myc##c.fs_myc ib5.desgroups || v2:, vce(cluster v2) difficult intmethod(laplace)
outreg2 using "${resultsdir}/TableA3_robustcheck_multi_access_data.doc", append label ctitle(Results - with unknown date) dec(3)