//project: bias due to duplicated observations in surveys
//task   : it computes the percentage bias, the bias of the standard error,
//		   the DFbetas, and the RMSE for Scenario 2 Variant 1 (N = 500)

//housekeeping
//this do-file saves its output to various subfolders:
//datasets go in the subfolder "data"
//charts go in the subfolder "gph"
//tables go in the subfolder "tabs"
 
//output locations, relative to the directory the do-file is run from
global dtapath "../data"
global docpath "../docs"
global gphpath "../gph"
global tabpath "../tabs"

//prefix and version tag used in the names of every file written below
global num "22"
global ver "V01"

//the following globals are used to change the look of the charts
global gphopt   "plotregion(lstyle(none)) scheme(s1mono)"
global noriquad "region(lstyle(none))"

//Builds one result file per number of doublets, for Variant 1 only.
//For every input dataset and every doublet count it:
//  1. loads the step-21 estimates and keeps the columns of that doublet count,
//  2. spreads the estimates / standard errors found in rows 1-4 over all rows
//     (presumably the true-value rows, indic == 10 -- TODO confirm row order),
//  3. computes the df-beta of every estimate against those reference values,
//  4. computes, per coefficient and solution, the standardized and percentage
//     bias, the mean standard error (raw and relative), the RMSE and the NRMSE,
//  5. saves the result as $num-<dataset>-$ver-<doublets>.dta.
local datasets " bigdata-mean-all "
foreach d of local datasets {
	//Scenario 2: number of doublets
	foreach i in 5 13 27 {
		use $dtapath/21-`d'-V01.dta, clear
		keep estimate`i' stderr`i' p`i' min95`i' max95`i' parm indic
		drop if parm == "dup"
		save $dtapath/tmp-all.dta, replace

		//reference estimates: row k (k = 1..4) is copied to every row that
		//occupies the same position within its block of four rows
		capture drop values
		generate values = .
		replace values = estimate`i' in 1/4
		forvalues k = 1/4 {
			replace values = estimate`i'[`k'] if mod(_n, 4) == mod(`k', 4)
		}

		//reference standard errors, spread over the rows in the same way
		capture drop stdvalues
		generate stdvalues = .
		replace stdvalues = stderr`i' in 1/4
		forvalues k = 1/4 {
			replace stdvalues = stderr`i'[`k'] if mod(_n, 4) == mod(`k', 4)
		}

		//df-beta scores, rounded to three decimals
		capture drop bias
		generate bias = .
		replace bias = round((estimate`i' - values)/stderr`i', 0.001)

		//numeric version of the coefficient name
		capture drop parme
		encode parm, gen(parme)

		//the rows with indic == 10 (the true values) are not needed on the chart
		drop if indic == 10

		//diagnostics for every coefficient (x z t _cons) and solution (11-16)
		foreach coef in x z t _cons {
			forvalues sol = 11/16 {
				//suffix of the generated variables and the rows they refer to
				local tag  `i'`coef'`sol'
				local cell `"parm == "`coef'" & indic == `sol'"'

				capture drop stdbias`tag'
				capture drop percbias`tag'
				capture drop covrate`tag'
				generate covrate`tag' = .

				//mean and standard deviation of the estimates in this cell
				summarize estimate`i' if `cell'
				local est_mean = r(mean)
				local est_sd   = r(sd)
				capture drop SEhat`tag'
				generate SEhat`tag' = `est_sd' if `cell'
				capture drop Betahat`tag'
				generate Betahat`tag' = `est_mean' if `cell'

				//reference value of the coefficient in this cell
				summarize values if `cell'
				local truth = r(mean)

				//Percentage and standardized bias
				generate stdbias`tag' = ((`est_mean' - `truth') / `est_sd') * 100 if `cell'
				generate percbias`tag' = ((`est_mean' - `truth') / `truth') * 100 if `cell'

				//average standard error, raw and relative to the reference one
				capture drop E_stderr`tag'
				egen E_stderr`tag' = mean(stderr`i') if `cell'
				generate nE_stderr`tag' = E_stderr`tag' / stdvalues if `cell'

				//RMSE
				capture drop rmse`tag'
				generate rmse`tag' = sqrt((`est_mean' - `truth')^2 + (E_stderr`tag')^2) if `cell'

				//NRMSE, in percent of the reference value and forced to be positive
				capture drop nrmse`tag'
				generate nrmse`tag' = ((sqrt((`est_mean' - `truth')^2 + (E_stderr`tag')^2))/`truth')*100 if `cell'
				replace nrmse`tag' = -nrmse`tag' if nrmse`tag' < 0
			}
		}

		//keeps only the relevant variables
		keep indic bias parme parm stdbias* percbias*  nrmse*  values est* stder* SEhat* E_stderr*  nE_stderr*

		//records which doublet count these rows belong to
		capture drop duplications
		generate duplications = `i'

		save $dtapath/$num-`d'-$ver-`i'.dta, replace
	}
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// Results about Scenario 2 Variant 1: duplicates randomly chosen from the overall distribution //
//////////////////////////////////////////////////////////////////////////////////////////////////

//Stacks the three per-doublet-count files into a single dataset
use $dtapath/$num-bigdata-mean-all-$ver-5.dta, clear
append using $dtapath/$num-bigdata-mean-all-$ver-13.dta
append using $dtapath/$num-bigdata-mean-all-$ver-27.dta
save $dtapath/$num-bigdata-mean-all-bias-$ver.dta, replace

//value labels shown on the chart: doublet counts and solutions
label define duplications 5 "5 doublets (2%)" 13 "13 doublets (5%)" 27 "27 doublets (10%)", modify
label values duplications duplications

label define indic 11 "a" 12 "b" 13 "c" 14 "d" 15 "e", modify
label values indic indic

label define parme 1 "_cons", modify

//box plots of the df-betas by solution and coefficient, one panel per
//doublet count, with reference lines at +/- 0.5
graph box bias, over(indic) over(parme) ///
	by(duplications, note("") cols(5) iscale(*0.75)) ///
	$gphopt yline(0.5 -0.5) ///
	marker(1, ms(Oh)) box(1, fcolor(none) lcolor(black)) ///
	medline(lcolor(black)) ///
	yscale(range(-1.5(0.5)1.5)) ytitle("df-betas")
graph export $gphpath/$num-bigbias-mean-$ver.eps, as(eps) preview(off) replace

//saves again so that the value labels are stored with the data
save $dtapath/$num-bigdata-mean-all-bias-$ver.dta, replace


//table with diagnostics (percbias nrmse)
//
//For each diagnostic it posts one row per coefficient and doublet count, with
//the mean of the diagnostic for solutions 11-15 in columns sol11-sol15, then
//saves the table as a .dta file in $dtapath and as a TeX file in $tabpath.

//remembers the starting directory so that it can be restored after the
//temporary move to $tabpath, whatever folder the do-file was launched from
local origdir `"`c(pwd)'"'

foreach g in nrmse percbias {
	use $dtapath/$num-bigdata-mean-all-bias-$ver.dta, clear
	tempname diag`g'
	postfile `diag`g'' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta", replace

	foreach v in x z t _cons {
		//the doublet counts must be the ones this dataset was built with
		//(5, 13 and 27, as in the value label of duplications): they are part
		//of the variable names, so any other value makes summarize fail
		foreach d in 5 13 27 {
			//row label: diagnostic + doublet count + coefficient
			local varlab "`g'`d'`v'"
			display "`varlab'"
			foreach s in 11 12 13 14 15 {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean)
			}
			post `diag`g'' ("`varlab'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `diag`g''

	use $dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc

	list
	save $dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta, replace

	//dataout writes to the working directory: move to $tabpath and come back
	cd "$tabpath"
	dataout, save($num-bigdata-mean-all-$ver-bias-diag`g') tex replace
	cd `"`origdir'"'
}


//table with diagnostics (nE_stderr)
//
//It posts one row per coefficient and doublet count, with the mean relative
//standard error (times 100) for solutions 11-15 in columns sol11-sol15, then
//saves the table as a .dta file in $dtapath and as a TeX file in $tabpath.

//remembers the starting directory so that it can be restored after the
//temporary move to $tabpath, whatever folder the do-file was launched from
local origdir `"`c(pwd)'"'

foreach g in nE_stderr {
	//same file name as the one the appended dataset is saved under
	//($num-bigdata-mean-all-bias-$ver.dta)
	use $dtapath/$num-bigdata-mean-all-bias-$ver.dta, clear
	tempname diag`g'
	postfile `diag`g'' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta", replace

	foreach v in x z t _cons {
		//the doublet counts must be the ones this dataset was built with
		//(5, 13 and 27, as in the value label of duplications): they are part
		//of the variable names, so any other value makes summarize fail
		foreach d in 5 13 27 {
			//row label: diagnostic + doublet count + coefficient
			local varlab "`g'`d'`v'"
			display "`varlab'"
			foreach s in 11 12 13 14 15 {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				//expressed in percent
				local sol`s' = r(mean) * 100
			}
			post `diag`g'' ("`varlab'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `diag`g''

	use $dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc

	list
	save $dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta, replace

	//dataout writes to the working directory: move to $tabpath and come back
	cd "$tabpath"
	dataout, save($num-bigdata-mean-all-$ver-bias-diag`g') tex replace dec(1)
	cd `"`origdir'"'
}
