* Move to the project folder and open the respondent-level data file
cd "/Users/evaaizpurua/Desktop/AT experiment"
use anonimiseddata, clear

* Frequency tables for the key variables
* (fre is not an official Stata command; assumed to be installed - TODO confirm)
fre response_mode exp_group completion_status

* ------------------------------------------------------------
* Incentive indicator: 1 when exp_group is 2 or 4, otherwise 0
* ------------------------------------------------------------
gen incentive = inlist(exp_group, 2, 4)
label define incentive_label 0 "Low, 10 euro conditional incentive" 1 "High, 25 euro conditional incentive"
label values incentive incentive_label
fre incentive exp_group
tabulate exp_group incentive

* ------------------------------------------------------------
* Length indicator: 1 when exp_group is 3 or 4, otherwise 0
* ------------------------------------------------------------
gen length = inlist(exp_group, 3, 4)
label define length_label 0 "Shorter" 1 "Longer"
label values length length_label
fre length exp_group
tabulate exp_group length

* -----------------
* Response
* -----------------

* Completion status: cross-tab with response mode, then with length
* (column percentages, chi-square test, Cramer's V), followed by a logit
* with the length x incentive interaction reported as odds ratios.
* NOTE(review): logit needs a 0 / non-zero outcome - confirm the coding of
* completion_status and response_mode.
tabulate completion_status response_mode
tabulate completion_status length, column chi2 V
logit completion_status i.length##i.incentive, or

* Response mode: same cross-tab and interaction model
tabulate response_mode length, column chi2 V
logit response_mode i.length##i.incentive, or

* Test
* last_answered = position (counting from 1, in dataset order over the
* range A1b-R1) of the last variable with a non-missing value. The loop
* overwrites the value each time a later answered item is found, so the
* final value is the highest answered position; it stays missing when no
* item in the range was answered.
* The counter is taken from the varlist itself; the former hard-coded
* local num_questions = 305 was never used and has been removed.
generate last_answered = .
local i = 0
foreach var of varlist A1b-R1 {
    local ++i
    replace last_answered = `i' if !missing(`var')
}

ta last_answered length, col chi2 V


* -----------------
* Recontact
* -----------------
* R1 by length: overall, then separately within each response mode
tabulate R1 length, column chi2 V
bysort response_mode: tabulate R1 length, column chi2 V

* recont is a copy of R1 with 2 recoded to 0 (1 stays 1; values not listed
* in the rules are carried over unchanged), used as the logit outcome
recode R1 (1 = 1) (2 = 0), generate(recont)
logit recont i.length##i.incentive, or

* Plot
* Hand-entered summary figures, one row per mode x length combination
clear
input str8 mode str8 length num_respondents proportion
"Paper" "Shorter"  53 29.1
"Web"   "Shorter" 354 67.8
"Paper" "Longer"   46 29.3
"Web"   "Longer"  317 65.8
end

* Go to one row per length with separate Paper / Web columns
reshape wide num_respondents proportion, i(length) j(mode) string
* Numeric code for length: 1 = Shorter, 2 = Longer
gen length_num = 2 - (length == "Shorter")

* white_tableau is not a built-in scheme; assumed to be installed - TODO confirm
set scheme white_tableau

* Side-by-side Paper and Web bars within each length group
graph bar (mean) proportionPaper proportionWeb, ///
    over(length, gap(5) lab(labsize(medium))) ///
    blabel(bar, format(%9.1f) size(medium)) ///
    legend(label(1 "Paper") label(2 "Web") size(medium)) ///
    ytitle("Percentage", size(medium)) ///
    ylabel(0(20)100)


* Import data and cleaning
import excel "/Users/evaaizpurua/Downloads/sample_strat.xlsx", sheet("Tabelle1") firstrow clear

* Remove serial numbers starting with "A"; keep the first letter of the
* remaining ones in newvar
drop if substr(serialnumber, 1, 1) == "A"
gen newvar = substr(serialnumber, 1, 1)

* Map the letters B, C, D, E to the codes 1, 2, 3, 4; any other letter
* leaves numvar missing
gen numvar = .
local code = 0
foreach letter in B C D E {
    local ++code
    replace numvar = `code' if newvar == "`letter'"
}
label define grouplabel 1 "Short, low" 2 "Short, high" 3 "Long, low" 4 "Long, high", replace
label values numvar grouplabel
list numvar serialnumber

* Creating 'length' and 'incentive' variables
* length: 0 for groups 1-2, 1 for groups 3-4; missing when numvar is missing
gen length = inlist(numvar, 3, 4) if !missing(numvar)

* incentive: 0 for groups 1 and 3, 1 for groups 2 and 4; missing when numvar is missing
gen incentive = inlist(numvar, 2, 4) if !missing(numvar)

* Check both indicators against the group code
tabulate length numvar
tabulate incentive numvar

* idno is an exact copy of serialnumber (used later as the merge key)
clonevar idno = serialnumber
list idno serialnumber

* Save sample data
save samplefile, replace

* Load 'anonimiseddata', create a 'respondent' variable, and save file.
* The file is saved back over itself, so after the first run it already
* contains respondent. Dropping any existing copy first keeps the script
* re-runnable; without it, gen stops with "variable respondent already
* defined" on every run after the first.
use anonimiseddata, clear
capture drop respondent
gen respondent = 1
save anonimiseddata, replace

* Load sample file and merge with anonimiseddata.
* keep(match master) retains every sample case, whether or not it appears
* in the respondent file; sample cases without a match have respondent
* missing after the merge and are set to 0 (non-respondent).
use samplefile, clear
merge 1:1 idno using anonimiseddata, keep(match master) nogen
replace respondent = 0 if missing(respondent)

* Label length variable
label define length_label 1 "Longer" 0 "Shorter"
label values length length_label

* Response by length (column percentages, chi-square, Cramer's V) and a
* logit with the length x incentive interaction reported as odds ratios
ta respondent length, col chi2 V
logit respondent length##incentive, or

* Response by length, leaving out cases whose status is "undeliverable"
tabulate respondent length if status != "undeliverable", column chi2 V

* Store the full merged file in a temporary file, then remove the
* undeliverable cases from the data in memory
tempfile data
save `data'
drop if status == "undeliverable"

* Cell counts are stored in matrix cell and the row values in matrix row;
* then the proportion of respondents within each length group
tabulate length respondent, matcell(cell) matrow(row)
proportion respondent, over(length)

* Cross-tab and interaction logit on the reduced data (the if-condition is
* kept as written, although those cases were already removed above)
tabulate respondent length if status != "undeliverable", column chi2 V
logit respondent i.length##i.incentive if status != "undeliverable", or


* Hand-entered response proportions with standard errors and 95% bounds,
* one row per questionnaire length
clear
input str8 length proportion se lower95 upper95
"Shorter" .3784057 .0108938 .3572952 .3999871
"Longer"  .3360202 .0106018 .315562  .3571126
end

* x-axis position (1 = Shorter, 2 = Longer) and a one-decimal percentage
* string used as the marker label
gen length_num = 2 - (length == "Shorter")
gen prop_label = string(100 * proportion, "%9.1f")

* white_hue is not a built-in scheme; assumed to be installed - TODO confirm
set scheme white_hue

* Bars for the proportions, capped spikes for the 95% bounds, and a
* labelled scatter layer carrying the percentage text
twoway (bar proportion length_num, barwidth(0.8)) ///
       (rcap lower95 upper95 length_num, lcolor(black)) ///
       (scatter proportion length_num, mlabel(prop_label) ///
           mlabsize(medium) mlabposition(10) mcolor("black")), ///
       xlabel(1 "Shorter (n=1,982)" 2 "Longer (n=1,985)", noticks labsize(medium)) ///
       xtitle("Survey Length", size(medium)) ///
       ytitle("Response Rate", size(medium)) ///
       ylabel(0(0.1)0.5, labsize(medium)) ///
       legend(off)


* Plot for response mode
* Hand-entered counts and percentages, one row per mode x length combination
clear
input str8 mode str8 length num_respondents proportion
"Paper" "Shorter" 201 26.80
"Web"   "Shorter" 549 73.20
"Paper" "Longer"  168 25.19
"Web"   "Longer"  499 74.81
end

* One row per length with separate Paper / Web columns
reshape wide num_respondents proportion, i(length) j(mode) string
* Numeric code for length: 1 = Shorter, 2 = Longer
gen length_num = 2 - (length == "Shorter")

* white_tableau is not a built-in scheme; assumed to be installed - TODO confirm
set scheme white_tableau

* Paper and Web percentages stacked into a single bar per length group
graph bar (mean) proportionPaper proportionWeb, ///
    over(length, gap(5) lab(labsize(medium))) stack ///
    blabel(bar, format(%9.1f) size(medium)) ///
    legend(label(1 "Paper") label(2 "Web") size(medium)) ///
    ytitle("Percentage", size(medium)) ///
    ylabel(0(20)100)


* -----------------
* Sample Composition
* -----------------
* Reload the respondent file; the indicators below are rebuilt in memory
use anonimiseddata, clear

* Incentive Variable: 1 for exp_group 2 or 4, 0 for everything else
gen incentive = (exp_group == 2 | exp_group == 4)
label define incentive_label 0 "Low, 10 euro conditional incentive" 1 "High, 25 euro conditional incentive"
label values incentive incentive_label
fre incentive exp_group
tabulate exp_group incentive

* Length Variable: 1 for exp_group 3 or 4, 0 for everything else
gen length = (exp_group == 3 | exp_group == 4)
label define length_label 0 "Shorter" 1 "Longer"
label values length length_label
fre length exp_group
tabulate exp_group length

* Country of birth (A78): frequencies, one indicator variable per category
* (g1, g2, ...), cross-tab with length, and a logit of the first indicator
* on the length x incentive interaction (odds ratios)
fre A78
tabulate A78, generate(g)
tabulate A78 length, column chi2 V
logit g1 i.length##i.incentive, or

* Sex (C2): indicator variables sex1, sex2, ..., cross-tab with length,
* and the same interaction logit on the first indicator
tabulate C2, generate(sex)
tabulate C2 length, column chi2 V
logit sex1 i.length##i.incentive, or

* Citizenship (A77): cross-tab with length and interaction logit.
* NOTE(review): A77 is used directly as the outcome, unlike A78 and C2 where
* an indicator is generated first - confirm A77 is coded 0 / non-zero.
tabulate A77 length, column chi2 V
logit A77 i.length##i.incentive, or

* Age. Distributions, potential differences based on survey length, and potential interaction effect.
* age_2021 is 2021 minus the year in C3_Year, so it is missing whenever
* C3_Year is missing.
gen age_2021 = 2021 - C3_Year
sum age_2021
ttest age_2021, by(length) unequal
regress age_2021 length##incentive

* Age groups. Distributions, cross-tabulation with survey length, and potential interaction effect.
* Stata treats a numeric missing value as larger than any number, so an
* open-ended test such as age_2021 >= 65 is also true when age is missing
* and would file those respondents under "65+". The !missing() guard leaves
* them with an empty age_group instead.
* NOTE(review): the first label reads "17-29" but the range starts at 18,
* so a 17-year-old would stay ungrouped - confirm which one is intended.
gen age_group = ""
replace age_group = "17-29" if inrange(age_2021, 18, 29)
replace age_group = "30-49" if inrange(age_2021, 30, 49)
replace age_group = "50-64" if inrange(age_2021, 50, 64)
replace age_group = "65+"   if age_2021 >= 65 & !missing(age_2021)
ta age_group length, col chi2 V

* Education (C13): turn the text entries into numbers.
* Commas are replaced by points, the text is converted to numeric (force
* turns anything non-numeric into missing), the value 9.8 is set to 10,
* and everything is rounded to the nearest whole number.
replace C13 = subinstr(C13, ",", ".", .)
destring C13, force replace
replace C13 = 10 if C13 == 9.8
replace C13 = round(C13, 1)

* Summary, difference by length (unequal-variance t test), and the
* length x incentive interaction
summarize C13
ttest C13, by(length) unequal
regress C13 i.length##i.incentive

* Paid job (C14_1): missing is recoded to 0 and 1 stays 1, in place;
* then cross-tab with length and interaction logit
fre C14_1
recode C14_1 (1 = 1) (. = 0)
tabulate C14_1 length, column chi2 V
logit C14_1 i.length##i.incentive


****** Dissimilarity Indices Plots
* Clear and input data
* Hand-entered dissimilarity indices, one row per variable, with separate
* columns for the shorter and longer questionnaire.
* The name column is str16 because "Country of birth" has 16 characters;
* with str15 it was stored truncated as "Country of birt" and shown that
* way on the axis.
clear
input str16 variable float d_short float d_long
"Country of birth" 10.4 9.8
"Citizenship" 10.3 11.0
"Sex" 2.6 1.9
"Age" 4.3 4.4
"Paid work" 7.7 10.1
end

* Reshape data and rename conditions
* After the reshape there are two rows per variable; condition holds the
* suffix ("short" / "long"), which is then replaced by the display text
reshape long d_, i(variable) j(condition) string
replace condition = "Shorter" if condition == "short"
replace condition = "Longer" if condition == "long"
encode variable, gen(variable_num)

* Set color scheme
set scheme white_hue

* Variable sorting
* Order rows by the per-variable mean index, largest first, and keep the
* row position as the sort key for the plot
egen avg_d = mean(d_), by(variable)
gsort -avg_d
gen sortorder = _n

* Plot
* NOTE(review): the other bar charts in this file write one label() per
* legend key (label(1 "Paper") label(2 "Web")); the combined form below is
* left as written - confirm the legend renders as intended.
graph bar d_, over(condition, gap(5) lab(labsize(small))) over(variable, sort(sortorder) gap(40) lab(labsize(medium))) ///
    legend(label(1 "Shorter" 2 "Longer") size(medium)) ///
    ytitle("Dissimilarity Index (%)", size(medium)) ///
    ylabel(0(5)15, format(%9.1f)) ///
    blabel(bar, format(%9.1f) size(medium))


* -----------------
* Data quality
* -----------------
* Reload the respondent file; the indicators below are rebuilt in memory
use anonimiseddata, clear

* Incentive Variable: 1 when exp_group is 2 or 4, else 0
gen incentive = cond(inlist(exp_group, 2, 4), 1, 0)
label define incentive_label 0 "Low, 10 euro conditional incentive" 1 "High, 25 euro conditional incentive"
label values incentive incentive_label
fre incentive exp_group
tabulate exp_group incentive

* Length Variable: 1 when exp_group is 3 or 4, else 0
gen length = cond(inlist(exp_group, 3, 4), 1, 0)
label define length_label 0 "Shorter" 1 "Longer"
label values length length_label
fre length exp_group
tabulate exp_group length

*** Missing data analysis.
* Items counted for item nonresponse, collected in the local myvars.
* The list is built up in several steps; the resulting set of variables is
* the same as a single long definition.
local myvars A1 A2 A3 A4 A5 A7 A9 A10 A11 A12 A13 A14 A15 A16 A17
local myvars `myvars' A18 A19 A20 A21 A22 A23 A24 A25 A27 A28 A29 A30 A31 A32 A33 A34 A35 A38 A39
local myvars `myvars' A40 A41 A42 A43 A44 A45 A46 A47 A48 A49 A50 A51 A52 A53 A54 A55 A56 A57 A58 A59
local myvars `myvars' A60 A61 A62 A63 A64 A65 A66 A67 A68 A69 A70 A72 A73 A74 A75 A77 A78 A82 A83 A85
local myvars `myvars' A87 C2 C7 C9 C10 C35 C38 C39 C41 C52 C55 C58_1 D63 D64 D65 D66 D67 D68 D69
local myvars `myvars' D70 D71 D72 D73 D74 D75 D76 D77 D78 D79 D80 D81 D82 D83 D84 D85 D86 D87 D88 D89
local myvars `myvars' D90 D92 D93 D94 D95 D97

* Per-respondent number of missing values across the listed items
egen missing_count = rowmiss(`myvars')

* Overall summary, then comparisons by length (overall and within each
* response mode), by response mode, and a length x incentive ANOVA
summarize missing_count
ttest missing_count, by(length) unequal
bysort response_mode: ttest missing_count, by(length) unequal
ttest missing_count, by(response_mode) unequal
anova missing_count length##incentive

* Mean and SD of the missing count in each length x response-mode cell.
* NOTE(review): contents() looks like the pre-Stata 17 table syntax -
* confirm the Stata version this is run under.
table length response_mode, contents(mean missing_count sd missing_count) format(%8.2f)

* Nondifferentiation
* cvhvs is produced by respdiff from items D63-D83 using its cv() function
* (respdiff is not an official Stata command; assumed to be installed and,
* presumably, cv = coefficient of variation - TODO confirm).
respdiff cvhvs = cv(D63-D83)
table length incentive, contents(mean cvhvs sd cvhvs) format(%8.2f)
sum cvhvs

* Comparisons by length, by response mode, by length within mode, and the
* length x incentive interaction
ttest cvhvs, by(length) unequal
ttest cvhvs, by(response_mode) unequal
bysort response_mode: ttest cvhvs, by(length) unequal
regress cvhvs length##incentive

* Correlations
* Spearman correlation of A47 and A48, overall and within each length
* group, followed by bootstrap estimates of rho for each group.
* seed() fixes the random-number seed so the bootstrap standard errors and
* intervals are identical on every run; without it they change each time
* the script is executed. The seed value itself is arbitrary.
spearman A47 A48
bysort length: spearman A47 A48
bootstrap r(rho), reps(1000) nodots seed(12345) : spearman A47 A48 if length==1
bootstrap r(rho), reps(1000) nodots seed(12345) : spearman A47 A48 if length==0

* Alpha values: Institutional trust
* Cronbach's alpha for items A17-A24 with item-level statistics, by length
* and by response mode x length
bysort length: alpha A17-A24, item
bysort response_mode length: alpha A17-A24, item


* Missing data plot.
* Hand-entered disaggregated summary figures: one row per mode x length group
clear
input str10 mode str10 group obs mean std_err std_dev lower95 upper95
"Paper" "Shorter" 201 5.09 .72 10.20 3.67 6.51
"Paper" "Longer"  168 6.21 .92 11.94 4.39 8.03
"Web"   "Shorter" 549 2.50 .39  9.16 1.73 3.27
"Web"   "Longer"  499 2.42 .33  7.46 1.77 3.08
end

* Numeric mode code (1 = Paper, 2 = Web) and x-axis position:
* 1 Shorter-Paper, 2 Shorter-Web, 3 Longer-Paper, 4 Longer-Web
gen mode_num = 2 - (mode == "Paper")
gen modegroup = mode_num + 2 * (group == "Longer")

* Bars for the means, capped spikes for the 95% bounds, and invisible
* markers that only carry the mean as a text label
twoway (bar mean modegroup, barwidth(0.8)) ///
       (rcap lower95 upper95 modegroup if group == "Shorter", lcolor(black)) ///
       (rcap lower95 upper95 modegroup if group == "Longer", lcolor(black)) ///
       (scatter mean modegroup if group == "Shorter", mlabel(mean) ///
           mlabsize(medium) mlabposition(10) mcolor(black) msymbol(none)) ///
       (scatter mean modegroup if group == "Longer", mlabel(mean) ///
           mlabsize(medium) mlabposition(10) mcolor(black) msymbol(none)), ///
       ytitle("Average Missing Values", size(medium)) ///
       xtitle("") ///
       xlabel(1 "Shorter - Paper" 2 "Shorter - Web" 3 "Longer - Paper" 4 "Longer - Web", ///
           noticks labsize(medium)) ///
       ylabel(0(1)10, labsize(medium)) legend(off)

* Nondifferentiation plot
* Hand-entered summary figures, one row per mode x length group.
* The standard deviation for Paper / Longer was entered as 10.15, which
* cannot be right next to a standard error of .01 with 144 observations
* and bounds of 0.42-0.47; those figures imply roughly 0.15, in line with
* the other rows, so the value is corrected here. std_dev is not drawn in
* the plot below, so the figure itself does not change.
clear
input str10 mode str10 group obs mean std_err std_dev lower95 upper95
"Paper" "Shorter" 191 0.42 .01 0.14 0.40 0.44
"Paper" "Longer" 144 0.45 .01 0.15 0.42 0.47
"Web" "Shorter" 534 0.43 .01 0.14 0.42 0.44
"Web" "Longer" 481 0.43 .01 0.13 0.41 0.44
end

* Convert string variable mode to numeric (1 = Paper, 2 = Web) and build
* the x-axis position: 1 Shorter-Paper, 2 Shorter-Web, 3 Longer-Paper,
* 4 Longer-Web
gen mode_num = cond(mode == "Paper", 1, 2)
gen modegroup = mode_num + cond(group == "Longer", 2, 0)

* Set visualization scheme
set scheme white_hue

* Bars for the means, capped spikes for the 95% bounds, and invisible
* markers that only carry the mean as a text label
twoway (bar mean modegroup, barwidth(0.8)) ///
       (rcap lower95 upper95 modegroup if group == "Shorter", lcolor(black)) ///
       (rcap lower95 upper95 modegroup if group == "Longer", lcolor(black)) ///
       (scatter mean modegroup if group == "Shorter", mlabel(mean) mlabsize(medium) mlabposition(10) mcolor(black) msymbol(none)) ///
       (scatter mean modegroup if group == "Longer", mlabel(mean) mlabsize(medium) mlabposition(10) mcolor(black) msymbol(none)), ///
       ytitle("Average Coefficient of Variation", size(medium)) ///
       xtitle("") ///
       xlabel(1 "Shorter - Paper" 2 "Shorter - Web" 3 "Longer - Paper" 4 "Longer - Web", noticks labsize(medium)) ///
       ylabel(0(0.1)0.8, labsize(medium)) legend(off)


* -----------------
* Breakoffs
* -----------------
* Open the allstarts file and cross-tabulate breakoff_status by length
* (column percentages and chi-square test)
use allstarts, clear
tabulate breakoff_status length, column chi2
