* -------------------------------------------------------------------------
* Stata do-file accompanying the article
* -------------------------------------------------------------------------
* Wenz, A., Jäckle, A., Burton, J., Couper, M. P., and Read, B. 
* Quality of expenditure data collected with a mobile receipt scanning app
* in a probability household panel
* -------------------------------------------------------------------------

* Set working directory ---------------------------------------------------
sysdir set PLUS "d:\home\user\ado\stbplus"
cd "I:\Research\Output quality\Data"

* Set colour scheme -------------------------------------------------------
set scheme s2mono

* Load data ---------------------------------------------------------------
use "Spending_Study_LCFS", clear

* Compute inverse probability weights -------------------------------------
// Fit logistic regression to compute propensity scores
logit source age age2 i.female i.degree income hhsize i.urban c.age#i.degree
predict ps
lab var ps "Propensity score"
margins, dydx(*) post

// Table A2. Logistic regression model of probability of being in the LCF
// sample as opposed to the Understanding Society Spending Study sample.
estout m1 using "I:\Research\Output quality\Tables\Table1.doc", replace margin cells(b(star fmt(3)) se(par fmt(2))) stats(N, fmt(3 0)) label

// Compute weights
gen ipw = 1/ps if source == 1
replace ipw = 1/(1-ps) if source == 0
lab var ipw "Inverse probability weight"

* Check inverse probability weights ---------------------------------------
// Identify outliers outside of range: mean +/- 3 * SD
// -- LCF
sum ipw if source == 1, d
di r(mean) + 3 * r(sd) // 1.359934
tab ipw source if source == 1 & ipw > 1.359934 // n = 16

sum ipw if source == 1, d
di r(mean) - 3 * r(sd) // 0.87617677
tab ipw source if source == 1 & ipw < 0.87617677 // n = 0

// -- Spending Study
sum ipw if source == 0, d
di r(mean) + 3 * r(sd) // 30.547148
tab ipw source if source == 0 & ipw > 30.547148 // n = 6

sum ipw if source == 0, d
di r(mean) - 3 * r(sd) // -11.952959
tab ipw source if source == 0 & ipw < -11.952959 // n = 0

// Recode extreme weights
gen ipw1 = ipw if source == 1 & ipw <= 1.359934
replace ipw1 = 1.359934 if source == 1 & ipw > 1.359934
replace ipw1 = ipw if source == 0 & ipw <= 30.547148
replace ipw1 = 30.547148 if source == 0 & ipw > 30.547148
lab var ipw1 "Trimmed inverse probability weight"

// Figure 1. Standardised differences in respondent characteristics before
// and after IPW.
lab var female "Gender: Female"
lab var degree "Education: Degree"
lab var urban "Urbanicity: Urban"
pbalchk source age female degree income urban hhsize, wt(ipw1) graph xline(-0.1 0.1)
graph export "I:\Research\Output quality\Figures\Figure1.pdf", as(pdf) replace

// Table A3. Sample composition and standardised differences before and
// after IPW.
xi: pbalchk source age i.female i.degree income i.urban hhsize
xi: pbalchk source age i.female i.degree income i.urban hhsize, wt(ipw1)

// Save data
save "Spending_Study_LCFS_with_weights", replace
