* Session set-up: fix the interpreter version, empty the data in memory,
* switch off output paging and close any log file that is still open.
version 15
clear
set more off
capture log close

* Directory globals used throughout this do-file:
*   d = folder holding the input data files
*   j = folder holding temporary / derived files
global j "temporaryfiles"
global d "IP11datafiles"

* Note: run the do-file cr_keystrokes-4.do before running this do-file


/* Data files needed for code to run:
****************************************
University of Essex, Institute for Social and Economic Research. (2021). 
Understanding Society: Innovation Panel, Waves 1-13, 2008-2020. 
[data collection]. 11th Edition. UK Data Service. 
SN: 6849, DOI: 10.5255/UKDA-SN-6849-14
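*-- $d/k_indresp_ip: IP11 individual interview data file
*-- $d/k_hhresp_ip: IP11 household interview data file
*-- $d/k_hhsamp_ip: IP11 household file holding the randomised mode allocation
*-- $j/timings: file derived from k_keystroke_paradata - IP11 paradata file
*--   (note: the merge further down reads $j/times - check the file name)
*-- $j/keystrokes: keystrokes file (presumably created by cr_keystrokes-4.do)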
*****************************************/

/*
CONTENT
- merge on randomised allocation to mode of interview
- merge on time stamps data
- drop proxies, tel respondent, respondents who did not answer consent Q
- recode treatment allocation variables
- drop FTF respondents who did not complete the self-completion questions
- recode variables
- keep only variables used
*/


* Household-level covariates from the household interview file (HHRESP),
* used for the randomisation checks.
* File paths are quoted so that directory globals containing spaces still work.
use "$d/k_hhresp_ip", clear

* Housing tenure in three groups; negative (missing) codes are grouped with "Other"
tab k_tenure_dv, miss
recode k_tenure_dv (1=1 "Owned outright") (2=2 "Being bought with mortgage") (-9/-1 3/max=3 "Other"), gen(tenure)
tab k_tenure_dv tenure, miss

* Household size, top-coded at 5
tab k_hhsize, miss
recode k_hhsize (5/max=5), gen(hhsize2)
tab k_hhsize hhsize2, miss

* Keep the household identifier and the two derived variables, then store them for merging
keep k_hidp tenure hhsize2
sort k_hidp
save "$j/hhresp", replace


* Start from the individual interview file and add the other sources.
* File paths are quoted so that directory globals containing spaces still work.
use "$d/k_indresp_ip", clear

* merge on HHRESP data (household covariates prepared above);
* households that only appear in the HHRESP file are dropped
sort k_hidp
merge m:1 k_hidp using "$j/hhresp"
drop if _merge==2
drop _merge

* merge on randomised allocation to mode of interview;
* only individuals found in both files are kept
sort k_hidp
merge m:1 k_hidp using "$d/k_hhsamp_ip", keepusing(k_hidp k_ff_gridmodew11 k_ff_lowwebw8)
keep if _merge==3
drop _merge
count

* merge on IP11 time stamps data
* NOTE(review): confirm that the timings data are saved under the file name "times"
sort pidp
merge 1:1 pidp using "$j/times"
drop if _merge==2 // 4 cases in the timings data but not in the INDRESP data
tab k_indmode _merge // 2 respondents with missing timings data
drop _merge
count

* there are five duplicate cases in the keystrokes data
* check the interview data: which mode did they complete in? which consent Q?
list pidp k_consentq* k_indmode if pidp == 1714008092 | pidp == 1716292852 ///
	| pidp == 1716292892 | pidp == 1727982092 | pidp == 1729525652
* for each respondent: dropped the observation in the keystrokes data
* that does not correspond to the interview data in terms of
* consent question answered and mode of interview

* merge on keystrokes data; cases that only appear in the keystrokes file are dropped
sort pidp
merge 1:1 pidp using "$j/keystrokes"
drop if _merge==2
drop _merge

* Consistency check 1: for each consent question version recorded in the
* keystrokes data (CQ = 1..4), tabulate the survey consent variables
forvalues v = 1/4 {
	tab1 k_consentq? if CQ == `v', missing
}

* Consistency check 2: mode indicators from the keystrokes data against
* the interview mode recorded in the survey data
*tab intleaf k_indmode, miss
*tab intdiag k_indmode, miss
tabulate wleaflet k_indmode, missing
tabulate wdiagram k_indmode, missing

// ok, the keystrokes and survey data match up

* Remove the wave prefix k_ from the variable names
renpfix k_
*svyset psu, strata(strata)

* Full respondents only (ivfio==1); proxy respondents are removed
tabulate ivfio
drop if ivfio != 1 // 94 proxy interviews dropped
count

* FTF and web respondents only; the telephone respondent (indmode==2) is removed
tabulate indmode
tabulate indmode, nolabel
keep if indmode != 2 // drop 1 TEL respondent
count

* Remove respondents in households with low web response propensity:
* these are all allocated to FTF-first, rather than being allocated to mode randomly
tabulate ff_lowwebw8
tabulate ff_lowwebw8, nolabel
keep if ff_lowwebw8 != 1
count


* Remove respondents with a missing (-9) answer on any consent question version
tab1 consentq*
label list k_consentq1
// -9 missing
// -2 refusal, -1 don't know: treat these as non-consent
drop if inlist(-9, consentq1, consentq2, consentq3, consentq4)

* Item non-response in the consent question:
* distribution of each version, leaving out code -8
foreach q in consentq1 consentq2 consentq3 consentq4 {
	tabulate `q' if `q' != -8
}


* Treatment indicator 1: difficulty of the consent question.
* Code 1 on either source variable gives 0 (Standard), code 2 gives 1 (Easy).
generate cdiff = 0 if inlist(1, condiffcapi, condiffcawi)
replace cdiff = 1 if inlist(2, condiffcapi, condiffcawi)
label variable cdiff "Difficulty of consent Q"
label define cdiff 0 "Standard" 1 "Easy"
label values cdiff cdiff
tabulate cdiff
tabulate condiffcapi condiffcawi, missing
tabulate indmode cdiff, missing

* Treatment indicator 2: location of the consent question.
* Code 1 on either source variable gives 0 (Early), code 2 gives 1 (Late).
generate cloc = 0 if inlist(1, conlocstand, conloceasy)
replace cloc = 1 if inlist(2, conlocstand, conloceasy)
label variable cloc "Location of consent Q"
label define cloc 0 "Early" 1 "Late"
label values cloc cloc
tabulate cloc
tabulate conlocstand conloceasy, missing
tabulate indmode cloc // check: web respondents were all asked for consent late in questionnaire

* Remove FTF respondents who did not answer the self-completion module
* that contains the consent follow-up questions.
* note: because of another experiment some respondents were routed into both
* the early and the late self-completion module!

* sc = 1 for
*   - web respondents, or
*   - early consent and early self-completion module accepted, or
*   - late consent and late self-completion module accepted
generate sc = (indmode == 3) | (cloc == 0 & scacearly == 1) | (cloc == 1 & scac == 1)
tabulate sc
tabulate indmode sc // check: all web respondents completed self-completion according to this definition
tabulate cloc sc if indmode == 1, row
tabulate cloc sc if indmode == 1, row nofreq chi2
// check: no difference between the early and the late treatment groups
// in the proportion of sample members who did not complete the self-completion section


* check that variables really are missing:
summarize csundstd* ccnfdnc* cdcsn* coundstd* cdata* csensitive* ctrust* if sc == 0
keep if sc != 0
drop sc
count



* Recode variables:
******************************
* Combine variables that were asked early/late into one variable
tab1 consentq?
label list k_consentq1 k_consentq2

* % of don't know/refused responses to consent questions
forvalues q = 1/4 {
	tabulate consentq`q' if consentq`q' != -8
}

* % of don't know/refused - overall and by mode
* (flag = 1 when any of the four versions holds code -1 or -2)
generate consentmv = inlist(consentq1, -1, -2) | inlist(consentq2, -1, -2) | ///
	inlist(consentq3, -1, -2) | inlist(consentq4, -1, -2)
tabulate consentmv
tabulate indmode consentmv, row nofreq chi2


// item non-response rate in consent questions overall is 1.7%
// not significantly different between web and ftf respondents
drop consentmv


* Consent outcome across the four question versions:
* 1 if any version was answered 1; 0 if any version was answered 2, -2 or -1
generate consent = .
replace consent = 1 if inlist(1, consentq1, consentq2, consentq3, consentq4)
replace consent = 0 if inlist(consentq1, 2, -2, -1) | inlist(consentq2, 2, -2, -1) | ///
	inlist(consentq3, 2, -2, -1) | inlist(consentq4, 2, -2, -1)
label define consent 1 "Consent" 0 "No consent"
label values consent consent
label variable consent "HMRC consent"
tabulate consent, missing

* Interview mode actually used: indmode 1 becomes 0 (FTF), indmode 3 becomes 1 (Web)
recode indmode (1=0 "FTF") (3=1 "Web"), gen(mode)
label variable mode "Mode of interview"
tabulate indmode mode, missing

* Allocated mode sequence: code 1 becomes 0 (FTF-first), code 3 becomes 1 (Web-first)
recode ff_gridmodew11 (1=0 "FTF-first") (3=1 "Web-first"), gen(modealloc)
label variable modealloc "Mode allocation"
tabulate ff_gridmodew11 modealloc, missing

* consent follow-up questions asked early/late - variable names:
* a "1" in the name: asked early, cloc==0 (can only occur in f2f)
* a "2" in the name: asked late, cloc==1 (all web cases, some f2f cases)

* objective understanding:
* take the early version and fall back on the late version
* when the early version is -8 or system missing
foreach item in a b c d e f g h {
	clonevar coundstd`item' = coundstd1`item'
	replace coundstd`item' = . if coundstd`item' == -8
	replace coundstd`item' = coundstd2`item' if coundstd`item' == .
}
tab1 coundstd?


* % of don't know/refused - overall and by mode
* (flag = 1 when the combined item holds code -1, -2 or -9)
foreach item in a b c d e f g h {
	generate objmv`item' = inlist(coundstd`item', -1, -2, -9)
}
summarize objmv*
// between 6.9 and 8.2% of items not answered
foreach item in a b c d e f g h {
	tabulate indmode objmv`item', row nofreq chi2
}

bysort indmode: summarize objmv*
// missing rate about 10 percentage points higher in web than ftf, for all test questions

* count of unanswered test questions per respondent
capture drop nmv
generate nmv = objmva + objmvb + objmvc + objmvd ///
	+ objmve + objmvf + objmvg + objmvh
label variable nmv "# test questions not answered"

*log using $j/testmissings.log, replace
* number of missing test items, by mode of interview:
********************************************************
bysort mode: tabulate nmv

* most of the web respondents who did not answer test questions
* did answer the next question (cdata2):
********************************************************
tabulate cdata2 if nmv == 8 & mode == 1

* reason for missingness mostly refusal,
* some don't know, some missing:
********************************************************
tab1 coundstd? if nmv == 8 & mode == 1
*log close

* the helper variables are only needed for the checks above
drop nmv objmv?


* exclude dk/ref answers
* correct answers to obj knowledge score
* obj1b ... obj8b = 1 if the item was answered correctly, 0 if answered
* incorrectly, and missing if the item holds a non-positive code.
* Stata treats system missing as larger than any number, so the explicit
* missing() guard keeps an unexpected missing item from being scored as wrong.
local items   a b c d e f g h
local correct 2 2 1 2 1 2 1 1
forvalues i = 1/8 {
	local item : word `i' of `items'
	local key  : word `i' of `correct'
	gen obj`i'b = coundstd`item' == `key' if coundstd`item' > 0 & !missing(coundstd`item')
}

* number of unanswered items per respondent
egen nmiss = rowmiss(obj?b)

* missing rates by interview mode (reported in Data section about objund score)
tab nmiss mode, col
bysort mode: tab nmiss

* score for respondents who answered all eight items
* (obj?b restricts the variable list to obj1b-obj8b)
egen objund = rowtotal(obj?b) if nmiss==0
lab var objund "No. correct answers to knowledge Qs, excl mvs"

recode objund (0/3=1 "No. correct answers: 0-3") (4=4 "4") (5=5 "5") (6=6 "6") (7/8=7 "7-8"), gen(objund2)
lab var objund2 "No. correct answers, grouped"
tab objund objund2, miss
tab objund2, miss


* score that also counts partially answered sets;
* missing only when all eight items are missing
egen objundpart = rowtotal(obj?b), missing
lab var objundpart "No. correct answers to knowledge Qs, incl partials"

list obj?b nmiss objund objundpart in 1/100

recode objundpart (0/3=1 "No. correct answers: 0-3") (4=4 "4") (5=5 "5") (6=6 "6") (7/8=7 "7-8"), gen(objundpart2)
lab var objundpart2 "No. correct answers incl partials, grouped"
tab objundpart objundpart2, miss
tab objundpart2, miss


* subj understanding, confidence , knows what HMRC data contain, sensitivity, interviewer observations (read leaflet, diagram):
* take the version with suffix 2 and fall back on the version with suffix 1 when it is -8
foreach stem in csundstd ccnfdnc cdata csensitive intcread intflread {
	clonevar `stem' = `stem'2
	replace `stem' = `stem'1 if `stem' == -8
	tabulate `stem', missing
}

* allocate the missing observations to modal categories
* do this by mode of interview

* subjective understanding: negative codes become 3 for FTF and 2 for web
tabulate csundstd mode
replace csundstd = 3 if inrange(csundstd, -9, -1) & mode == 0
replace csundstd = 2 if inrange(csundstd, -9, -1) & mode == 1
recode csundstd (1 2=0 "Do not/somewhat understand") (-9/-1 3 4=1 "Completely/mostly understand"), gen(subj)
label variable subj "Subjective understanding"
tabulate csundstd subj, missing

* confidence in the consent decision
tabulate ccnfdnc mode
replace ccnfdnc = 2 if inrange(ccnfdnc, -9, -1) // cat 2 is modal category for web and FTF
recode ccnfdnc (1 2 = 1 "(Very) confident in consent decision") (3 4 = 0 "Somewhat/not confident"), gen(conf)
label variable conf "Confidence in consent decision"
tabulate ccnfdnc conf, missing

* knowledge of what data HMRC hold
tabulate cdata mode
replace cdata = 2 if inrange(cdata, -9, -1) // cat 2 is modal category for web and FTF
recode cdata (1 2 = 1 "Knows (roughly) what data HMRC have") (3 4 = 0 "Does not know/no data about me"), gen(data)
label variable data "Whether knows what data HMRC have"
tabulate cdata data, missing

* perceived sensitivity of the HMRC data
tabulate csensitive mode
replace csensitive = 2 if inrange(csensitive, -9, -1) // cat 2 is modal category for web and FTF
recode csensitive (1 2 = 1 "HMRC data are (highly) sensitive") (3 4 5 = 0 "Somewhat/not sensitive/no data about me"), gen(sens)
label variable sens "Sensitivity of HMRC data"
tabulate csensitive sens, missing

* whether amount of information was enough/too much/too little:
* take cinfo1, fall back on cinfo2 when it is -8, then set negative codes to 3
tab1 cinfo?, missing
clonevar info = cinfo1
replace info = cinfo2 if info == -8
tabulate info
replace info = 3 if info < 0
tabulate info, missing


* whether read/clicked on information leaflet
tabulate intcread
tabulate intcread, nolabel
tabulate intcread if intcread > -8
* code as 'read' if read fully, group the other categories as not read
* (1 when either the interviewer observation or the web indicator equals 1)
generate leaflet = inlist(1, intcread, wleaflet)
label variable leaflet "Read/clicked on leaflet"
tabulate leaflet
tabulate indmode leaflet

* whether read/clicked on diagram
tabulate intflread
tabulate intflread, nolabel
tabulate intflread if intflread > -8
* problem: this seems to depend on interviewer, whether they talked the respondent through the diagram
* code as 'read' if talked through diagram fully
* (1 when either the interviewer observation or the web indicator equals 1)
generate diagram = inlist(1, intflread, wdiagram)
label variable diagram "Discussed/clicked on diagram"
tabulate diagram
tabulate indmode diagram

* consent decision process

* combine the four items: take cdcsn1x, fall back on cdcsn2x when it is -8,
* and copy the variable label of cdcsn2x
forvalues k = 1/4 {
	clonevar decision`k' = cdcsn1`k'
	replace decision`k' = cdcsn2`k' if decision`k' == -8
	local lbl : variable label cdcsn2`k'
	label variable decision`k' "`lbl'"
}
tab1 decision*, missing

* decision: categories 1-3 = exactly that one item chosen (the other three not equal to 1);
* every remaining pattern goes into category 4
generate decision = .
forvalues k = 1/3 {
	local only "decision`k' == 1"
	forvalues m = 1/4 {
		if `m' != `k' {
			local only "`only' & decision`m' != 1"
		}
	}
	replace decision = `k' if `only'
}
*replace decision = 4 if decision1~=1 & decision2~=1 & decision3~=1 & decision4==1
*replace decision = 5 if decision==.
replace decision = 4 if decision == .
label variable decision "Consent decision process"
*lab def decision 1 "Pro,con" 2 "Gut feeling" 3 "As usual" 4"Something else" 5 "Other patterns or missing"
label define decision 1 "Pro,con" 2 "Gut feeling" 3 "As usual" 4 "Sth else or combinations"
label values decision decision
tabulate decision

* decisionb: as above, but item 4 chosen alone is its own category
* and the remaining patterns go into category 5
generate decisionb = .
forvalues k = 1/4 {
	local only "decision`k' == 1"
	forvalues m = 1/4 {
		if `m' != `k' {
			local only "`only' & decision`m' != 1"
		}
	}
	replace decisionb = `k' if `only'
}
replace decisionb = 5 if decisionb == .
label variable decisionb "Consent decision process"
label define decisionb 1 "Pro,con" 2 "Gut feeling" 3 "As usual" 4"Something else" 5 "Other patterns or missing"
label values decisionb decisionb
tabulate decisionb decision, missing

* indicator for the pro/con category of decision
generate procon = (decision == 1)
label define procon 1 "Decision process: pro/con" 0 "No"
label values procon procon
label variable procon "Decision process: pro/con"
*gen gut = decision==2
*lab var gut "Decision process: gut feeling"
*gen habit = decision==3
*lab var habit "Decision process: habit"
*gen selse = decision==4
*lab var selse "Decision process: something else"
*gen combi = decision==5
*lab var combi "Decision process: combinations"


* response times:
* pick the timing variable that belongs to the respondent's treatment cell
*   time_consent1: cdiff 0, cloc 0     time_consent2: cdiff 1, cloc 0
*   time_consent3: cdiff 0, cloc 1     time_consent4: cdiff 1, cloc 1
generate time = .
forvalues loc = 0/1 {
	forvalues diff = 0/1 {
		local v = 1 + `diff' + 2*`loc'
		replace time = time_consent`v' if cdiff == `diff' & cloc == `loc'
	}
}
tabulate time, missing
generate logtime = log(time)
label variable logtime "Log of response time"


* privacy concerns: negative codes are set to 2 before dichotomising
tabulate privacy mode
recode privacy (-9/-1 = 2) // no missings in FTF
recode privacy (1 2=1 "Very/somewhat worried about privacy") (3 4=0 "Not very/at all worried") , gen(priv)
label variable priv "Privacy concerns"
tabulate privacy priv, missing


* data security concerns: negative codes are set to 2 before dichotomising
tabulate dtascrty mode
recode dtascrty (-9/-1=2) // cat2 is modal category for FTF and web
recode dtascrty (1 2=1 "Very/somewhat concerned about data security") (3 4=0 "Not very/at all concerned"), gen(secur)
label variable secur "Data security concerns"
tabulate dtascrty secur, missing


* trust in the survey (item a), in HMRC (item c) and in the University of Essex (item b):
* take ctrust1x, fall back on ctrust2x when it is -8, set negative codes to 3,
* then dichotomise
local item_surv a
local item_hmrc c
local item_uni  b
local who_surv "the survey"
local who_hmrc "HMRC"
local who_uni  "University"
foreach s in surv hmrc uni {
	clonevar trust_`s' = ctrust1`item_`s''
	replace trust_`s' = ctrust2`item_`s'' if trust_`s' == -8
	label variable trust_`s' "Trust in `who_`s''"
	tabulate trust_`s' mode
	recode trust_`s' (-9/-1=3)
	recode trust_`s' (1 2 =0 "Not at all/a little") (3 4 =1 "Trusts `who_`s'' somewhat/a lot"), gen(t`s')
	label variable t`s' "Trust in `who_`s''"
	tabulate trust_`s' t`s', missing
}


* socio-demographics

* sex: indicator for code 2
tabulate sex, missing
tabulate sex, nolabel
generate female = (sex == 2)
label define female 0 "Male" 1 "Female"
label values female female
tabulate sex female, missing

* age in six bands; codes -2 and -1 become missing
tabulate pdvage, missing
recode pdvage (16/29=1 "16-29") (30/39=2 "30-39") (40/49=3 "40-49") ///
	(50/59=4 "50-59") (60/69=5 "60-69") (70/max=6 "70+")(-2/-1=.), gen(agegroup)
label variable agegroup "Age group"
tabulate pdvage agegroup, missing

* age in three bands; negative codes become missing
recode pdvage (60/max=3 "60+") (41/59=2 "41-59") (16/40=1 "16-40") (-9/-1=.), gen(ageg)
tabulate pdvage ageg, missing


* highest qualification: degree indicator (negative codes count as no degree)
tabulate qfhigh_dv
recode qfhigh_dv (1/6=1 "degree or equivalent") (-9/-1 7/96=0 "no degree"), gen(degree)
label variable degree "Degree"
tabulate qfhigh_dv degree, missing
tabulate degree


* highest qualification in three groups (negative codes go into the lowest group)
recode qfhigh_dv (1/6=1 "Degree or equivalent") (7/11=2 "A/AS level") (-9/-1 12/99=3 "GCSE or lower"), gen(edu)
label variable edu "Education"
tabulate qfhigh_dv edu, missing



tabulate nchresp, missing

* region: code -9 becomes missing
tabulate gor_dv, missing
mvdecode gor_dv, mv(-9)
tabulate gor_dv

* employment status: codes 1 and 2 are in work, everything else (incl. negative codes) is not
tabulate jbstat, missing
recode jbstat (1 2=1 "In work") (-9/-1 3/97 = 0 "Not in work"), gen(inwork)
tabulate jbstat inwork, missing

tabulate frreadbk, missing


tabulate privacy, missing

tabulate dtascrty, missing



* device used to complete the survey, grouped into three categories
tabulate deviceused
label list k_deviceused
recode deviceused (1=1 "PC/laptop/netbook") (2 3=2 "Tablet")(4 5=3 "Smartphone"), gen(device)
label variable device "Device used to complete web survey"
tabulate deviceused device, missing
* FTF respondents (indmode==1) get a missing value
replace device = . if indmode == 1 // set device to missing for FTF respondents




* keep only variables used
keep pidp psu strata hidp mode modealloc cloc cdiff  ///
	consent time logtime decision decisionb procon  ///
	csundstd subj objund objund2 ccnfdnc conf cdata data csensitive sens info ///
	privacy priv dtascrty secur ///
	wleaflet wdiagram leaflet diagram ///
	trust_surv tsurv trust_hmrc thmrc trust_uni tuni ///
	female agegroup ageg degree edu nchresp inwork gor_dv frreadbk tenure hhsize2 intcread ///
	device pdvage ///
	indinip_xw indscip_xw ///
	objundpart objundpart2 ///
	hhorig

* shrink storage types, print a summary of every kept variable and save the file;
* the path is quoted so that a directory global containing spaces still works
compress
sum
save "$j/cr-4", replace

exit


		   






















