//project: bias due to duplicated observations in surveys
//task   : it computes the percentage bias, the bias of the standard error,
//		   the DFbetas, and the RMSE for Scenario 1

//housekeeping
//this do-file saves its output to various subfolders:
//datasets go in the subfolder "data"
//charts go in the subfolder "gph"
//tables go in the subfolder "tabs"

//output folders (relative paths) and the tags used to build file names
global dtapath "../data"
global docpath "../docs"
global gphpath "../gph"
global tabpath "../tabs"
global num     "12"
global ver     "V03"

//option strings that change the look of the charts
//($noriquad is defined here but not referenced in the visible part of this file)
global gphopt   "plotregion(lstyle(none)) scheme(s1mono)"
global noriquad "region(lstyle(none))"

//The computations below are repeated for every input dataset; each dataset
//corresponds to a different Variant.
local datasets "bigdata-mean-all bigdata-mean2-all bigdata-low-all bigdata-high-all"
//regression terms (values of parm) and solution codes (values of indic)
//for which the diagnostics are computed
local coeffs "x z t _cons"
local solutions "11 12 13 14 15 16"

foreach d of local datasets {
	//Scenario 1: one pass per number of duplicates
	foreach i in 5 7 9 {
		use $dtapath/11-`d'-V03.dta, clear
		keep estimate`i' stderr`i' p`i' min95`i' max95`i' parm indic
		drop if parm == "dup"
		save $dtapath/tmp-all.dta, replace

		//reference values: the entries of rows 1-4 are recycled down the
		//dataset in blocks of four (row k, k+4, k+8, ... all get row k's value)
		//NOTE(review): presumably rows 1-4 hold the true-model results
		//(indic == 10) -- confirm against the sort order of the input file
		gen values = .
		gen stdvalues = .
		replace values = estimate`i' in 1/4
		replace stdvalues = stderr`i' in 1/4
		forvalues k = 1/4 {
			replace values = estimate`i'[`k'] if mod(_n, 4) == mod(`k', 4)
			replace stdvalues = stderr`i'[`k'] if mod(_n, 4) == mod(`k', 4)
		}

		//Df-beta scores, rounded to three decimals
		gen bias = round((estimate`i' - values)/stderr`i', 0.001)

		encode parm, gen(parme)

		//rows with indic == 10 are dropped: their bias is not wanted on the chart
		drop if indic == 10

		//diagnostics for every coefficient x solution cell
		foreach c of local coeffs {
			foreach s of local solutions {
				local cell `"parm == "`c'" & indic == `s'"'
				local tag "`i'`c'`s'"

				gen covrate`tag' = .

				//mean and standard deviation of the estimates in the cell
				summarize estimate`i' if `cell'
				local betahat = r(mean)
				local SEhat = r(sd)
				gen SEhat`tag' = `SEhat' if `cell'
				gen Betahat`tag' = `betahat' if `cell'

				//mean of the reference values in the cell
				summarize values if `cell'
				local beta = r(mean)

				//standardized and percentage bias
				gen stdbias`tag' = ((`betahat' - `beta') / `SEhat') * 100 if `cell'
				gen percbias`tag' = ((`betahat' - `beta') / `beta') * 100 if `cell'

				//average standard error, and its ratio to the reference standard error
				egen E_stderr`tag' = mean(stderr`i') if `cell'
				gen nE_stderr`tag' = E_stderr`tag' / stdvalues if `cell'

				//RMSE
				gen rmse`tag' = sqrt((`betahat' - `beta')^2 + (E_stderr`tag')^2) if `cell'

				//NRMSE: RMSE as a percentage of the reference value, sign removed
				gen nrmse`tag' = ((sqrt((`betahat' - `beta')^2 + (E_stderr`tag')^2))/`beta')*100 if `cell'
				replace nrmse`tag' = nrmse`tag' * -1 if nrmse`tag' < 0
			}
		}

		//only the relevant variables are kept, tagged with the number of duplicates
		keep indic bias parme parm stdbias* percbias*  nrmse*  values est* stder* SEhat* E_stderr*  nE_stderr*
		gen duplications = `i'

		save $dtapath/$num-`d'-$ver-`i'.dta, replace
	}
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// Results about Scenario 1 Variant 1: duplicates randomly chosen from the overall distribution //
//////////////////////////////////////////////////////////////////////////////////////////////////

//Variant 1: stack the results for 5, 7 and 9 duplicates into one dataset
use $dtapath/$num-bigdata-mean-all-$ver-5.dta, clear
append using $dtapath/$num-bigdata-mean-all-$ver-7.dta
append using $dtapath/$num-bigdata-mean-all-$ver-9.dta
save $dtapath/$num-bigdata-mean-all-$ver-bias.dta, replace

//value labels for the panels (duplications) and the boxes (indic)
label define duplications 5 "0.4%" 7 "0.5%" 9 "0.6%", modify
label values duplications duplications

label define indic 11 "a" 12 "b" 13 "c" 14 "d" 15 "e", modify
label values indic indic

//box plot of the df-betas by solution and coefficient, one panel per level
//of duplications, with reference lines at +/-0.5
graph box bias, over(indic) over(parme) ///
	by(duplications, note("") cols(5) iscale(*0.75)) ///
	$gphopt yline(0.5 -0.5) ///
	marker(1, ms(Oh)) box(1, fcolor(none) lcolor(black)) ///
	medline(lcolor(black)) ///
	yscale(range(-1.5(0.5)1.5)) ytitle("df-betas")
graph export $gphpath/$num-bigbias-mean-$ver.eps, as(eps) preview(off) replace

//saved again so that the value labels are stored with the dataset
save $dtapath/$num-bigdata-mean-all-$ver-bias.dta, replace

//Variant 1 tables for nrmse and percbias: one row per coefficient and number
//of duplicates, one column per solution (indic 11-15)
local diags "nrmse percbias"
local vars "x z t _cons"
local dup "5 7 9"
local solutions "11 12 13 14 15"
foreach g of local diags {
	use $dtapath/$num-bigdata-mean-all-$ver-bias.dta, clear
	tempname ph
	postfile `ph' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta", replace
	foreach v of local vars {
		foreach d of local dup {
			local rowname "`g'`d'`v'"
			display "`rowname'"
			//mean of the diagnostic within each solution
			foreach s of local solutions {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean)
			}
			post `ph' ("`rowname'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `ph'

	//reload the posted table, format it, save it and export it with dataout
	use $dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc
	list
	save $dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta, replace
	cd $tabpath
	dataout, save($num-bigdata-mean-all-$ver-bias-diag`g') tex dec(1) replace
	cd ../tmp
}


//Variant 1 table for nE_stderr (means multiplied by 100): one row per
//coefficient and number of duplicates, one column per solution (indic 11-15)
local diags "nE_stderr"
local vars "x z t _cons"
local dup "5 7 9"
local solutions "11 12 13 14 15"
foreach g of local diags {
	use $dtapath/$num-bigdata-mean-all-$ver-bias.dta, clear
	tempname ph
	postfile `ph' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta", replace
	foreach v of local vars {
		foreach d of local dup {
			local rowname "`g'`d'`v'"
			display "`rowname'"
			foreach s of local solutions {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean) * 100
			}
			post `ph' ("`rowname'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `ph'

	//reload the posted table, format it, save it and export it with dataout
	use $dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc
	list
	save $dtapath/$num-bigdata-mean-all-$ver-bias-diag`g'.dta, replace
	cd $tabpath
	dataout, save($num-bigdata-mean-all-$ver-bias-diag`g') tex replace dec(1)
	cd ../tmp
}


////////////////////////////////////////////////////////////////////////////////////////////////////////
// Results about Scenario 1 Variant 2: duplicates randomly chosen around the mean of the distribution //
////////////////////////////////////////////////////////////////////////////////////////////////////////

//Variant 2: stack the results for 5, 7 and 9 duplicates into one dataset
use $dtapath/$num-bigdata-mean2-all-$ver-5.dta, clear
append using $dtapath/$num-bigdata-mean2-all-$ver-7.dta
append using $dtapath/$num-bigdata-mean2-all-$ver-9.dta
save $dtapath/$num-bigdata-mean2-all-$ver-bias.dta, replace

//value labels for the panels (duplications) and the boxes (indic);
//they are attached after the save above, so they only affect the chart
label define duplications 5 " 0.4%" 7 " 0.5%" 9 " 0.6%", modify
label values duplications duplications

label define indic 11 "a" 12 "b" 13 "c" 14 "d" 15 "e", modify
label values indic indic

//box plot of the df-betas by solution and coefficient, one panel per level
//of duplications, with reference lines at +/-0.5
graph box bias, over(indic) over(parme) ///
	by(duplications, note("") cols(5) iscale(*0.75)) ///
	$gphopt yline(0.5 -0.5) ///
	marker(1, ms(Oh)) box(1, fcolor(none) lcolor(black)) ///
	medline(lcolor(black)) ///
	yscale(range(-1.5(0.5)1.5)) ytitle("df-betas")
graph export $gphpath/$num-bigbias-mean2-$ver.eps, as(eps) preview(off) replace
 
 
//Variant 2 tables for percbias and nrmse: one row per number of duplicates
//and coefficient (duplicates vary slowest), one column per solution
local diags "percbias nrmse"
local dup "5 7 9"
local vars "x z t _cons"
local solutions "11 12 13 14 15"
foreach g of local diags {
	use $dtapath/$num-bigdata-mean2-all-$ver-bias.dta, clear
	tempname ph
	postfile `ph' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-mean2-all-$ver-bias-diag`g'.dta", replace
	foreach d of local dup {
		foreach v of local vars {
			local rowname "`g'`d'`v'"
			display "`rowname'"
			//mean of the diagnostic within each solution
			foreach s of local solutions {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean)
			}
			post `ph' ("`rowname'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `ph'

	//reload the posted table, format it, save it and export it with dataout
	use $dtapath/$num-bigdata-mean2-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc
	list
	save $dtapath/$num-bigdata-mean2-all-$ver-bias-diag`g'.dta, replace
	cd $tabpath
	dataout, save($num-bigdata-mean2-all-$ver-bias-diag`g') tex dec(1) replace
	cd ../tmp
}


//Variant 2 table for nE_stderr (means multiplied by 100): one row per
//coefficient and number of duplicates, one column per solution (indic 11-15)
local diags "nE_stderr"
local vars "x z t _cons"
local dup "5 7 9"
local solutions "11 12 13 14 15"
foreach g of local diags {
	use $dtapath/$num-bigdata-mean2-all-$ver-bias.dta, clear
	tempname ph
	//the results are posted to the Variant 2 (mean2) file, i.e. the same file
	//that is reloaded after postclose; posting to the Variant 1 (mean) file
	//would overwrite that Variant's table and leave this one unwritten
	postfile `ph' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-mean2-all-$ver-bias-diag`g'.dta", replace
	foreach v of local vars {
		foreach d of local dup {
			local rowname "`g'`d'`v'"
			display "`rowname'"
			foreach s of local solutions {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean) * 100
			}
			post `ph' ("`rowname'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `ph'

	//reload the posted table, format it, save it and export it with dataout
	use $dtapath/$num-bigdata-mean2-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc
	list
	save $dtapath/$num-bigdata-mean2-all-$ver-bias-diag`g'.dta, replace
	cd $tabpath
	dataout, save($num-bigdata-mean2-all-$ver-bias-diag`g') tex replace dec(1)
	cd ../tmp
}


////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Results about Scenario 1 Variant 3: duplicates randomly chosen below the lower tail of the distribution //
////////////////////////////////////////////////////////////////////////////////////////////////////////////

//Variant 3: stack the results for 5, 7 and 9 duplicates into one dataset
use $dtapath/$num-bigdata-low-all-$ver-5.dta, clear
append using $dtapath/$num-bigdata-low-all-$ver-7.dta
append using $dtapath/$num-bigdata-low-all-$ver-9.dta
save $dtapath/$num-bigdata-low-all-$ver-bias.dta, replace

//value labels for the panels (duplications) and the boxes (indic);
//they are attached after the save above, so they only affect the chart
label define duplications 5 " 0.4%" 7 " 0.5%" 9 " 0.6%", modify
label values duplications duplications

label define indic 11 "a" 12 "b" 13 "c" 14 "d" 15 "e", modify
label values indic indic

//box plot of the df-betas by solution and coefficient, one panel per level
//of duplications, with reference lines at +/-0.5
graph box bias, over(indic) over(parme) ///
	by(duplications, note("") cols(5) iscale(*0.75)) ///
	$gphopt yline(0.5 -0.5) ///
	marker(1, ms(Oh)) box(1, fcolor(none) lcolor(black)) ///
	medline(lcolor(black)) ///
	yscale(range(-1.5(0.5)1.5)) ytitle("df-betas")
graph export $gphpath/$num-bigbias-low-$ver.eps, as(eps) preview(off) replace
 


//Variant 3 tables for percbias and nrmse: one row per coefficient and number
//of duplicates, one column per solution (indic 11-15)
local diags "percbias nrmse"
local vars "x z t _cons"
local dup "5 7 9"
local solutions "11 12 13 14 15"
foreach g of local diags {
	use $dtapath/$num-bigdata-low-all-$ver-bias.dta, clear
	tempname ph
	postfile `ph' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-low-all-$ver-bias-diag`g'.dta", replace
	foreach v of local vars {
		foreach d of local dup {
			local rowname "`g'`d'`v'"
			display "`rowname'"
			//mean of the diagnostic within each solution
			foreach s of local solutions {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean)
			}
			post `ph' ("`rowname'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `ph'

	//reload the posted table, format it, save it and export it with dataout
	use $dtapath/$num-bigdata-low-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc
	list
	save $dtapath/$num-bigdata-low-all-$ver-bias-diag`g'.dta, replace
	cd $tabpath
	dataout, save($num-bigdata-low-all-$ver-bias-diag`g') tex dec(1) replace
	cd ../tmp
}

//Variant 3 table for nE_stderr (means multiplied by 100): one row per
//coefficient and number of duplicates, one column per solution (indic 11-15)
local diags "nE_stderr"
local vars "x z t _cons"
local dup "5 7 9"
local solutions "11 12 13 14 15"
foreach g of local diags {
	use $dtapath/$num-bigdata-low-all-$ver-bias.dta, clear
	tempname ph
	postfile `ph' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-low-all-$ver-bias-diag`g'.dta", replace
	foreach v of local vars {
		foreach d of local dup {
			local rowname "`g'`d'`v'"
			display "`rowname'"
			foreach s of local solutions {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean) * 100
			}
			post `ph' ("`rowname'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `ph'

	//reload the posted table, format it, save it and export it with dataout
	use $dtapath/$num-bigdata-low-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc
	list
	save $dtapath/$num-bigdata-low-all-$ver-bias-diag`g'.dta, replace
	cd $tabpath
	dataout, save($num-bigdata-low-all-$ver-bias-diag`g') tex replace dec(1)
	cd ../tmp
}
 

////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Results about Scenario 1 Variant 4: duplicates randomly chosen above the upper tail of the distribution //
////////////////////////////////////////////////////////////////////////////////////////////////////////////

//Variant 4: stack the results for 5, 7 and 9 duplicates into one dataset
use $dtapath/$num-bigdata-high-all-$ver-5.dta, clear
append using $dtapath/$num-bigdata-high-all-$ver-7.dta
append using $dtapath/$num-bigdata-high-all-$ver-9.dta
save $dtapath/$num-bigdata-high-all-$ver-bias.dta, replace

//value labels for the panels (duplications) and the boxes (indic)
label define duplications 5 " 0.4%" 7 " 0.5%" 9 " 0.6%", modify
label values duplications duplications

label define indic 11 "a" 12 "b" 13 "c" 14 "d" 15 "e", modify
label values indic indic

//box plot of the df-betas by solution and coefficient, one panel per level
//of duplications, with reference lines at +/-0.5
graph box bias, over(indic) over(parme) ///
	by(duplications, note("") cols(5) iscale(*0.75)) ///
	$gphopt yline(0.5 -0.5) ///
	marker(1, ms(Oh)) box(1, fcolor(none) lcolor(black)) ///
	medline(lcolor(black)) ///
	yscale(range(-1.5(0.5)1.5)) ytitle("df-betas")
graph export $gphpath/$num-bigbias-high-$ver.eps, as(eps) preview(off) replace

//saved again so that the value labels are stored with the dataset
save $dtapath/$num-bigdata-high-all-$ver-bias.dta, replace



//Variant 4 tables for percbias and nrmse: one row per coefficient and number
//of duplicates, one column per solution (indic 11-15)
local diags "percbias nrmse"
local vars "x z t _cons"
local dup "5 7 9"
local solutions "11 12 13 14 15"
foreach g of local diags {
	use $dtapath/$num-bigdata-high-all-$ver-bias.dta, clear
	tempname ph
	postfile `ph' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-high-all-$ver-bias-diag`g'.dta", replace
	foreach v of local vars {
		foreach d of local dup {
			local rowname "`g'`d'`v'"
			display "`rowname'"
			//mean of the diagnostic within each solution
			foreach s of local solutions {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean)
			}
			post `ph' ("`rowname'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `ph'

	//reload the posted table, format it, save it and export it with dataout
	use $dtapath/$num-bigdata-high-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc
	list
	save $dtapath/$num-bigdata-high-all-$ver-bias-diag`g'.dta, replace
	cd $tabpath
	dataout, save($num-bigdata-high-all-$ver-bias-diag`g') tex dec(1) replace
	cd ../tmp
}

//Variant 4 table for nE_stderr (means multiplied by 100): one row per
//coefficient and number of duplicates, one column per solution (indic 11-15)
local diags "nE_stderr"
local vars "x z t _cons"
local dup "5 7 9"
local solutions "11 12 13 14 15"
foreach g of local diags {
	use $dtapath/$num-bigdata-high-all-$ver-bias.dta, clear
	tempname ph
	postfile `ph' str20 variable sol11 sol12 sol13 sol14 sol15 ///
		using "$dtapath/$num-bigdata-high-all-$ver-bias-diag`g'.dta", replace
	foreach v of local vars {
		foreach d of local dup {
			local rowname "`g'`d'`v'"
			display "`rowname'"
			foreach s of local solutions {
				summarize `g'`d'`v'`s' if indic == `s' & parm == "`v'"
				local sol`s' = r(mean) * 100
			}
			post `ph' ("`rowname'") (`sol11') (`sol12') (`sol13') (`sol14') (`sol15')
		}
	}
	postclose `ph'

	//reload the posted table, format it, save it and export it with dataout
	use $dtapath/$num-bigdata-high-all-$ver-bias-diag`g'.dta, clear
	format sol11 sol12 sol13 sol14 sol15 %9.1fc
	list
	save $dtapath/$num-bigdata-high-all-$ver-bias-diag`g'.dta, replace
	cd $tabpath
	dataout, save($num-bigdata-high-all-$ver-bias-diag`g') tex replace  dec(1)
	cd ../tmp
}

//////////////////////////////////////////////////////////////////////////////
// it prepares the table with the results of acceptable bias for each case //
//////////////////////////////////////////////////////////////////////////////

//Builds one table with the mean of acceptperc for each number of duplicates
//and each value of indic, with one column per Variant (overall, mean, low, high).
//NOTE(review): acceptperc is not created anywhere in the visible part of this
//file, and the -bias.dta files written above do not keep such a variable --
//confirm where it is supposed to come from before running this section.
use $dtapath/$num-bigdata-mean-all-$ver-bias.dta, clear

loc vars " "
loc vars " acceptperc "
//the levels of indic and duplications are taken from the Variant 1 file only
levelsof indic, loc(scenario)
levelsof duplications, loc(ndup)
tempname Yab1tomany
postfile `Yab1tomany'   scenario overall mean low high using "$dtapath/$num-duplicates-$ver-Yab1tomany.dta", replace 
	//the loop variable v is not used inside the loop: acceptperc is hard-coded
	foreach v of loc vars{
		foreach n of loc ndup{
			foreach s of loc scenario{
			//one mean per Variant file, for the current (duplications, indic) cell
			use $dtapath/$num-bigdata-mean-all-$ver-bias.dta, clear
			sum acceptperc if duplications == `n' & indic == `s'
			loc tmp1 " "  //overall
			loc tmp1 = r(mean)
			use $dtapath/$num-bigdata-mean2-all-$ver-bias.dta, clear
			sum acceptperc if duplications == `n' & indic == `s'
			loc tmp2 " "  //mean
			loc tmp2 = r(mean)
			use $dtapath/$num-bigdata-low-all-$ver-bias.dta, clear
			sum acceptperc if duplications == `n' & indic == `s'
			loc tmp3 " "  //low
			loc tmp3 = r(mean)
			use $dtapath/$num-bigdata-high-all-$ver-bias.dta, clear
			sum acceptperc if duplications == `n' & indic == `s'
			loc tmp4 " "  //high
			loc tmp4 = r(mean)
			post `Yab1tomany'  (`s') (`tmp1') (`tmp2') (`tmp3') (`tmp4')
			}
		}
	}
postclose `Yab1tomany'

//The posted rows are split into three slices of five rows, and each slice is
//transposed so that the posted variables become rows.
//NOTE(review): the slices 1/5, 6/10, 11/15 assume exactly five levels of indic
//per number of duplicates; the computation loop earlier in this file also
//handles indic == 16, so confirm that the data hold only five levels here.
use $dtapath/$num-duplicates-$ver-Yab1tomany.dta, clear 
format overall mean low high  %3.1f
list  
keep in 1/5
xpose, clear
save $dtapath/$num-tmp1-$ver.dta,replace

use $dtapath/$num-duplicates-$ver-Yab1tomany.dta, clear 
format overall mean low high  %3.1f
list  
keep in 6/10
xpose, clear
save $dtapath/$num-tmp2-$ver.dta,replace

use $dtapath/$num-duplicates-$ver-Yab1tomany.dta, clear 
format overall mean low high  %3.1f
list  
keep in 11/15
xpose, clear
save $dtapath/$num-tmp3-$ver.dta,replace

//the three transposed slices are stacked one below the other
use $dtapath/$num-tmp1-$ver.dta,clear
append using $dtapath/$num-tmp2-$ver.dta $dtapath/$num-tmp3-$ver.dta


save $dtapath/$num-duplicates-$ver-Yab1tomany.dta, replace 
cd $tabpath 
//xpose names the columns v1-v5; they are renamed a1-a5
forvalues n = 1/5{
ren v`n' a`n'
}

//Rows 1, 6 and 11 (the first row of each slice, i.e. the transposed scenario
//variable) are set to missing and dropped; the remaining rows are reordered
//so that the three slices are interleaved (rows 2, 7, 12, then 3, 8, 13, ...).
gen tmp = _n
recode tmp (1 = .) (2 =1) (3 = 4) (4 =	7) (5 =	10) (6 =. ) (7 = 2) (8 = 5) (9 = 8) (10 = 11) (11 = .) (12 = 3) (13 = 6) (14 = 9) (15 =	12)
sort tmp
drop if tmp == .
drop tmp
dataout, save($num-duplicates-$ver-Yab1tomany) tex  replace
cd ../tmp



/////////////////////////////////////////////////////////////////////////
// 		TABLES FOR DESCRIPTIVE STATISTICS 
/////////////////////////////////////////////////////////////////////////
//Simulate the initial data set (1500 observations) and tabulate it
set seed 1234

clear
set obs 1500
//covariance matrix and mean vector used to draw w, x, z and t
matrix sigma = (7.80, -10.36, 1.12, 2.36 \ ///
	-10.36, 293.36, -1.28, 22.96 \ ///
	1.12, -1.28, 5.83, -1.81 \ ///
	2.36, 22.96, -1.81, 174.23)
matrix m = (5.16, 48.06, 4.99, 39.97)
drawnorm w x z t, n(1500) mean(m) cov(sigma)
summarize

//error term, drawn after the covariates
drawnorm epsilon, n(1500)
summarize epsilon

//outcome: linear in x, z and t, plus the scaled error term
generate y = 5.36 - 0.04 * x + 0.16 * z + 0.023 * t  + 2.5*epsilon
foreach v in x y z t {
	label variable `v' "`v'"
}
save $dtapath/$num-desc-$ver.dta, replace

//one row per variable: mean, sd, min, max, N and share of missing values
local vars "y x z t"
tempname miss
postfile `miss' str50 variable mean sd min max obs missing ///
	using $dtapath/$num-desc-data-$ver, replace
foreach v of local vars {
	local varlab : variable label `v'
	capture drop mis`v'
	generate mis`v' = (`v' == .)
	summarize mis`v'
	local missi = r(mean)
	summarize `v'
	post `miss' ("`varlab'") (r(mean)) (r(sd)) (r(min)) (r(max)) (r(N)) (`missi')
}
postclose `miss'

//reload the posted table, caption it, format it, save it and export it
use $dtapath/$num-desc-data-$ver, clear
generate ndup = "Initial data set"
label variable ndup "N. of duplicates"

format mean sd min max missing %9.2fc
format obs %9.0fc
list
save $dtapath/$num-desc-nodup-$ver.dta, replace
cd $tabpath
dataout, save($num-desc-nodup-$ver) tex replace
cd ../tmp

////////////////////////////////////////////////////////////////////////
// 		SCENARIO 1
////////////////////////////////////////////////////////////////////////

//Scenario 1 data sets: one randomly drawn observation is replicated
foreach i in 5 7 9 {
	//draw a single observation and expand it to `i' identical rows
	use $dtapath/$num-desc-$ver.dta, clear
	sample 1, count
	capture drop dup
	generate dup = 1
	expand `i'
	save $dtapath/tmp1.dta, replace

	//append those rows to the full initial data set
	use $dtapath/$num-desc-$ver.dta, clear
	capture drop dup
	generate dup = 0
	append using $dtapath/tmp1.dta

	//tag rows that share x, y, z and t with other rows; the tag (number of
	//other identical rows) is divided by `i'
	capture drop duptag
	duplicates tag x y z t, gen(duptag)
	replace duptag = duptag / `i'
	label variable duptag "duplicates (flag)"

	//untagged rows get a random sort key (tagged rows get missing and sort
	//last); the first `i' rows in that order are then dropped
	capture drop random
	generate random = runiform() if duptag == 0
	sort random
	drop in 1/`i'
	save $dtapath/$num-desc-scenario1-`i'-$ver, replace
}

//Scenario 1 descriptive tables, one per number of duplicates
foreach i in 5 7 9 {
	use $dtapath/$num-desc-scenario1-`i'-$ver, clear
	local vars "y x z t duptag"

	//one row per variable: mean, sd, min, max, N and share of missing values.
	//The postfile target has the same name as the file loaded above, so the
	//table replaces that file on disk.
	tempname miss
	postfile `miss' str50 variable mean sd min max obs missing ///
		using $dtapath/$num-desc-scenario1-`i'-$ver, replace
	foreach v of local vars {
		local varlab : variable label `v'
		capture drop mis`v'
		generate mis`v' = (`v' == .)
		summarize mis`v'
		local missi = r(mean)
		summarize `v'
		post `miss' ("`varlab'") (r(mean)) (r(sd)) (r(min)) (r(max)) (r(N)) (`missi')
	}
	postclose `miss'

	//reload the posted table, caption it, format it, save it and export it
	use $dtapath/$num-desc-scenario1-`i'-$ver, clear
	//NOTE(review): the caption reads "Initial data set" although this table
	//describes a data set with duplicates -- confirm the intended text
	generate ndup = "Initial data set"
	label variable ndup "N. of duplicates"

	format mean sd min max missing %9.2fc
	format obs %9.0fc
	list
	save $dtapath/$num-desc-scenario1-`i'-$ver.dta, replace
	cd $tabpath
	dataout, save($num-desc-scenario1-`i'-$ver) tex replace
	cd ../tmp
}

////////////////////////////////////////////////////////////////////////
// 		SCENARIO 2
////////////////////////////////////////////////////////////////////////
 
//Scenario 2 data sets: `i' randomly drawn observations are each added once more
foreach i in 16 40 79 {
	//draw `i' observations
	use $dtapath/$num-desc-$ver.dta, clear
	sample `i', count
	capture drop dup
	generate dup = 1
	save $dtapath/tmp1.dta, replace

	//append them to the full initial data set
	use $dtapath/$num-desc-$ver.dta, clear
	capture drop dup
	generate dup = 0
	append using $dtapath/tmp1.dta

	//tag rows that share x, y, z and t with another row
	capture drop duptag
	duplicates tag x y z t, gen(duptag)
	label variable duptag "duplicates (flag)"

	//untagged rows get a random sort key (tagged rows get missing and sort
	//last); the first `i' rows in that order are then dropped
	capture drop random
	generate random = runiform() if duptag == 0
	sort random
	drop in 1/`i'
	save $dtapath/$num-desc-scenario2-`i'-$ver, replace
}

//Scenario 2 descriptive tables, one per number of duplicated observations
foreach i in 16 40 79 {
	use $dtapath/$num-desc-scenario2-`i'-$ver, clear
	local vars "y x z t duptag"

	//one row per variable: mean, sd, min, max, N and share of missing values.
	//The postfile target has the same name as the file loaded above, so the
	//table replaces that file on disk.
	tempname miss
	postfile `miss' str50 variable mean sd min max obs missing ///
		using $dtapath/$num-desc-scenario2-`i'-$ver, replace
	foreach v of local vars {
		local varlab : variable label `v'
		capture drop mis`v'
		generate mis`v' = (`v' == .)
		summarize mis`v'
		local missi = r(mean)
		summarize `v'
		post `miss' ("`varlab'") (r(mean)) (r(sd)) (r(min)) (r(max)) (r(N)) (`missi')
	}
	postclose `miss'

	//reload the posted table, format it, save it and export it
	use $dtapath/$num-desc-scenario2-`i'-$ver, clear
	format mean sd min max missing %9.2fc
	format obs %9.0fc
	list
	save $dtapath/$num-desc-scenario2-`i'-$ver.dta, replace
	cd $tabpath
	dataout, save($num-desc-scenario2-`i'-$ver) tex replace
	cd ../tmp
}


 
////////////////////////////////////////////////////////////////////////
// 		SCENARIO 3
////////////////////////////////////////////////////////////////////////
//Scenario 3 data sets: `i' randomly drawn observations are each replicated
foreach i in 7 16 31 {
	//draw `i' observations and expand each of them to four identical rows
	use $dtapath/$num-desc-$ver.dta, clear
	sample `i', count
	capture drop dup
	generate dup = 1
	expand 4
	save $dtapath/tmp1.dta, replace

	//append those rows to the full initial data set
	use $dtapath/$num-desc-$ver.dta, clear
	capture drop dup
	generate dup = 0
	append using $dtapath/tmp1.dta

	//tag rows that share x, y, z and t with other rows; the tag (number of
	//other identical rows) is divided by 4
	capture drop duptag
	duplicates tag x y z t, gen(duptag)
	replace duptag = duptag / 4
	label variable duptag "duplicates (flag)"

	//untagged rows get a random sort key (tagged rows get missing and sort
	//last); the first 4 * `i' rows in that order are then dropped
	capture drop random
	generate random = runiform() if duptag == 0
	sort random
	local top = `i'*4
	drop in 1/`top'
	save $dtapath/$num-desc-scenario3-`i'-$ver, replace
}

//Scenario 3 descriptive tables, one per number of duplicated observations
foreach i in 7 16 31 {
	use $dtapath/$num-desc-scenario3-`i'-$ver, clear
	local vars "y x z t duptag"

	//one row per variable: mean, sd, min, max, N and share of missing values.
	//The postfile target has the same name as the file loaded above, so the
	//table replaces that file on disk.
	tempname miss
	postfile `miss' str50 variable mean sd min max obs missing ///
		using $dtapath/$num-desc-scenario3-`i'-$ver, replace
	foreach v of local vars {
		local varlab : variable label `v'
		capture drop mis`v'
		generate mis`v' = (`v' == .)
		summarize mis`v'
		local missi = r(mean)
		summarize `v'
		post `miss' ("`varlab'") (r(mean)) (r(sd)) (r(min)) (r(max)) (r(N)) (`missi')
	}
	postclose `miss'

	//reload the posted table, format it, save it and export it
	use $dtapath/$num-desc-scenario3-`i'-$ver, clear
	format mean sd min max missing %9.2fc
	format obs %9.0fc
	list
	save $dtapath/$num-desc-scenario3-`i'-$ver.dta, replace
	cd $tabpath
	dataout, save($num-desc-scenario3-`i'-$ver) tex replace
	cd ../tmp
}
