//project: bias due to duplicated observations in surveys
//task: runs the analysis according to Scenario 2 using a simulated dataset of 1500 observations.
//		Duplicates are drawn from the distribution of y

//housekeeping
//the dofiles saves its output to various subfolders: 
//datasets go in the subfolder "data"
//charts go in the subfolder "gph"
//tables go in the subfolder "tabs"

//relative paths of the output subfolders
global dtapath "../data"
global docpath "../docs"
global gphpath "../gph"
global tabpath "../tabs"

//prefix and version tag used in the names of the saved datasets
global num "13"
global ver "V03"

//the following globals are used to change the look of the charts
global gphopt "plotregion(lstyle(none)) scheme(s1mono)"
global noriquad "region(lstyle(none))"


//number of repetitions of every simulation setting
local iterations "2500"


//fixes the seed of the random-number generator
set seed 1234

//builds a simulated dataset of N = 1500 observations
clear
set obs 1500

//the covariance matrix and the average scores are taken from the European Social Survey
//(pooled dataset from wave 6), using the 4 variables hinctnta, agea, ppltrst and wkhtot
matrix sigma = (  7.80, -10.36,  1.12,   2.36 \ ///
                -10.36, 293.36, -1.28,  22.96 \ ///
                  1.12,  -1.28,  5.83,  -1.81 \ ///
                  2.36,  22.96, -1.81, 174.23 )
matrix m = (5.16, 48.06, 4.99, 39.97)

//randomly draws 1500 values of the 4 variables from the means and covariances given above
drawnorm w x z t, n(1500) mean(m) cov(sigma)

//draws the error term
capture drop epsilon
drawnorm epsilon, n(1500)

//generates y from the regression equation below;
//the coefficients are taken from the European Social Survey as specified above
capture drop y
generate y = 5.36 - 0.04 * x + 0.16 * z + 0.023 * t  + 2.5*epsilon

correlate x y z t, covariance

//this clean dataset (no duplicates) is the starting point of every simulation
save $dtapath/$num-bigdatabase-initial-$ver.dta, replace

//estimates the true regression and stores its results
eststo: regress y x z t

parmest, saving($dtapath/tmp.dta, replace)
use $dtapath/tmp.dta, clear
keep parm estimate stderr p min95 max95
//the variable indic keeps track of the solution adopted to deal with duplicated observations.
//indic = 10 marks the true results, i.e. those obtained before including the duplicates.
capture drop indic
generate indic = 10
save $dtapath/$num-data-$ver.dta, replace

/////////////////////////////////////////////////////////////////////////////////
//
//			GENERIC CASE DUPLICATES RANDOMLY CHOSEN ON THE WHOLE DISTRIBUTION
//
/////////////////////////////////////////////////////////////////////////////////

//Scenario 2: doublets are drawn at random from the whole distribution of y.
//The outer loop sets the number of doublets (16, 40 or 79); the inner loop repeats
//each setting as many times as stored in the local iterations.
//The per-iteration result files (d, ddrop, du, drreg, dwreg) are overwritten by each setting.
foreach i in 16 40 79 {
    forvalues j = 1/`iterations' {
        //draws the records that will be duplicated
        use $dtapath/$num-bigdatabase-initial-$ver.dta, clear
        sample `i', count
        capture drop dup
        generate dup = 1
        save $dtapath/tmp1.dta, replace

        //appends the copies (dup = 1) to the full clean sample (dup = 0)
        use $dtapath/$num-bigdatabase-initial-$ver.dta, clear
        capture drop dup
        generate dup = 0
        append using $dtapath/tmp1.dta

        //duptag counts the additional copies of each record: 0 for unique records
        capture drop duptag
        duplicates tag x y z t, gen(duptag)

        //random is missing for duplicated records, so they sort last; the first rows
        //after sorting are unique records in random order. Dropping as many of them as
        //there are added copies brings the sample back to its original size.
        capture drop random
        generate random = runiform() if duptag == 0
        sort random
        drop in 1/`i'
        save $dtapath/$num-bigdatabase-$ver, replace

        //naive solution: the duplicates are ignored (indic = 11)
        regress y x z t
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 11
        save $dtapath/d`j'.dta, replace

        //dropping all duplicates, i.e. reducing the sample size (indic = 12)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        regress y x z t if duptag == 0
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 12
        save $dtapath/ddrop`j'.dta, replace

        //flagging and controlling: duptag enters the regression as a covariate (indic = 13)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        regress y x z t duptag
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 13
        save $dtapath/du`j'.dta, replace

        //dropping all duplicates but one: only the appended copies are removed (indic = 14)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        drop if dup == 1
        regress y x z t
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 14
        save $dtapath/drreg`j'.dta, replace

        //weighting by the inverse of the multiplicity (indic = 15).
        //duptag + 1 is the number of copies of a record, so unique records get weight 1
        //and doublets get weight 1/2; no record is left with a missing weight.
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        capture drop weightdup
        generate weightdup = 1/(duptag + 1)
        regress y x z t [pw = weightdup]
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 15
        save $dtapath/dwreg`j'.dta, replace
    }

    //appends the results of all iterations to the true results (indic = 10)
    use $dtapath/$num-data-$ver.dta, clear
    forvalues j = 1/`iterations' {
        append using $dtapath/d`j'.dta
        append using $dtapath/ddrop`j'.dta
        append using $dtapath/du`j'.dta
        append using $dtapath/drreg`j'.dta
        append using $dtapath/dwreg`j'.dta
    }

    //id is the row number; rows are appended in the same order for every number of
    //doublets, so id is later used to match rows across the three settings
    capture drop id
    generate id = _n
    save $dtapath/$num-bigdata-mean-$ver-`i', replace
}

//merges the three result datasets of Scenario 2;
//iddup is the number of doublets (16, 40 or 79) used to produce each row
use $dtapath/$num-bigdata-mean-$ver-16.dta, clear
capture drop iddup
generate iddup = 16
foreach i in 40 79 {
    append using $dtapath/$num-bigdata-mean-$ver-`i'.dta
    replace iddup = `i' if missing(iddup)
}

//changes the dataset from long to wide format: one set of result columns per value of iddup
reshape wide estimate stderr p min95 max95, i(id) j(iddup)
//the coefficient of the duplicate flag itself is not of interest
drop if parm == "duptag"

save $dtapath/$num-bigdata-mean-all-$ver.dta, replace

/////////////////////////////////////////////////////////////////////////////////
//
//			DUPLICATES CHOSEN AROUND THE MEAN
//
/////////////////////////////////////////////////////////////////////////////////
//Scenario 2: doublets are drawn from the centre of the distribution of y,
//i.e. among the records lying between the 25th and the 75th percentile.
//The outer loop sets the number of doublets (16, 40 or 79); the inner loop repeats
//each setting as many times as stored in the local iterations.
//The per-iteration result files (d, ddrop, du, drreg, dwreg) are overwritten by each setting.
foreach i in 16 40 79 {
    forvalues j = 1/`iterations' {
        use $dtapath/$num-bigdatabase-initial-$ver.dta, clear

        //sets the boundaries around the centre of the distribution and keeps
        //only the records inside them as candidates for duplication
        summarize y, detail
        keep if y >= r(p25) & y <= r(p75)

        //draws the records that will be duplicated
        sample `i', count
        capture drop dup
        generate dup = 1
        save $dtapath/tmp1.dta, replace

        //appends the copies (dup = 1) to the full clean sample (dup = 0)
        use $dtapath/$num-bigdatabase-initial-$ver.dta, clear
        capture drop dup
        generate dup = 0
        append using $dtapath/tmp1.dta

        //duptag counts the additional copies of each record: 0 for unique records
        capture drop duptag
        duplicates tag x y z t, gen(duptag)

        //random is missing for duplicated records, so they sort last; the first rows
        //after sorting are unique records in random order. Dropping as many of them as
        //there are added copies brings the sample back to its original size.
        capture drop random
        generate random = runiform() if duptag == 0
        sort random
        drop in 1/`i'
        save $dtapath/$num-bigdatabase-$ver, replace

        //naive solution: the duplicates are ignored (indic = 11)
        regress y x z t
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 11
        save $dtapath/d`j'.dta, replace

        //dropping all duplicates, i.e. reducing the sample size (indic = 12)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        regress y x z t if duptag == 0
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 12
        save $dtapath/ddrop`j'.dta, replace

        //flagging and controlling: duptag enters the regression as a covariate (indic = 13)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        regress y x z t duptag
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 13
        save $dtapath/du`j'.dta, replace

        //dropping all duplicates but one: only the appended copies are removed (indic = 14)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        drop if dup == 1
        regress y x z t
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 14
        save $dtapath/drreg`j'.dta, replace

        //weighting by the inverse of the multiplicity (indic = 15).
        //duptag + 1 is the number of copies of a record, so unique records get weight 1
        //and doublets get weight 1/2; no record is left with a missing weight.
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        capture drop weightdup
        generate weightdup = 1/(duptag + 1)
        regress y x z t [pw = weightdup]
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 15
        save $dtapath/dwreg`j'.dta, replace
    }

    //appends the results of all iterations to the true results (indic = 10)
    use $dtapath/$num-data-$ver.dta, clear
    forvalues j = 1/`iterations' {
        append using $dtapath/d`j'.dta
        append using $dtapath/ddrop`j'.dta
        append using $dtapath/du`j'.dta
        append using $dtapath/drreg`j'.dta
        append using $dtapath/dwreg`j'.dta
    }

    //id is the row number; rows are appended in the same order for every number of
    //doublets, so id is later used to match rows across the three settings
    capture drop id
    generate id = _n
    save $dtapath/$num-bigdata-mean2-$ver-`i', replace
}

//merges the three result datasets of Scenario 2;
//iddup is the number of doublets (16, 40 or 79) used to produce each row
use $dtapath/$num-bigdata-mean2-$ver-16.dta, clear
capture drop iddup
generate iddup = 16
foreach i in 40 79 {
    append using $dtapath/$num-bigdata-mean2-$ver-`i'.dta
    replace iddup = `i' if missing(iddup)
}

//changes the dataset from long to wide format: one set of result columns per value of iddup
reshape wide estimate stderr p min95 max95, i(id) j(iddup)
//the coefficient of the duplicate flag itself is not of interest
drop if parm == "duptag"

save $dtapath/$num-bigdata-mean2-all-$ver.dta, replace



/////////////////////////////////////////////////////////////////////////////////
//
//			DUPLICATES CHOSEN AROUND THE LOWER TAIL
//
/////////////////////////////////////////////////////////////////////////////////
 
//Scenario 2: doublets are drawn from the lower tail of the distribution of y,
//i.e. among the records at or below the 25th percentile.
//The outer loop sets the number of doublets (16, 40 or 79); the inner loop repeats
//each setting as many times as stored in the local iterations.
//The per-iteration result files (d, ddrop, du, drreg, dwreg) are overwritten by each setting.
foreach i in 16 40 79 {
    forvalues j = 1/`iterations' {
        use $dtapath/$num-bigdatabase-initial-$ver.dta, clear

        //sets the boundary of the lower tail of the distribution and keeps
        //only the records below it as candidates for duplication
        summarize y, detail
        keep if y <= r(p25)

        //draws the records that will be duplicated
        sample `i', count
        capture drop dup
        generate dup = 1
        save $dtapath/tmp1.dta, replace

        //appends the copies (dup = 1) to the full clean sample (dup = 0)
        use $dtapath/$num-bigdatabase-initial-$ver.dta, clear
        capture drop dup
        generate dup = 0
        append using $dtapath/tmp1.dta

        //duptag counts the additional copies of each record: 0 for unique records
        capture drop duptag
        duplicates tag x y z t, gen(duptag)

        //random is missing for duplicated records, so they sort last; the first rows
        //after sorting are unique records in random order. Dropping as many of them as
        //there are added copies brings the sample back to its original size.
        capture drop random
        generate random = runiform() if duptag == 0
        sort random
        drop in 1/`i'
        save $dtapath/$num-bigdatabase-$ver, replace

        //naive solution: the duplicates are ignored (indic = 11)
        regress y x z t
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 11
        save $dtapath/d`j'.dta, replace

        //dropping all duplicates, i.e. reducing the sample size (indic = 12)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        regress y x z t if duptag == 0
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 12
        save $dtapath/ddrop`j'.dta, replace

        //flagging and controlling: duptag enters the regression as a covariate (indic = 13)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        regress y x z t duptag
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 13
        save $dtapath/du`j'.dta, replace

        //dropping all duplicates but one: only the appended copies are removed (indic = 14)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        drop if dup == 1
        regress y x z t
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 14
        save $dtapath/drreg`j'.dta, replace

        //weighting by the inverse of the multiplicity (indic = 15).
        //duptag + 1 is the number of copies of a record, so unique records get weight 1
        //and doublets get weight 1/2; no record is left with a missing weight.
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        capture drop weightdup
        generate weightdup = 1/(duptag + 1)
        regress y x z t [pw = weightdup]
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 15
        save $dtapath/dwreg`j'.dta, replace
    }

    //appends the results of all iterations to the true results (indic = 10)
    use $dtapath/$num-data-$ver.dta, clear
    forvalues j = 1/`iterations' {
        append using $dtapath/d`j'.dta
        append using $dtapath/ddrop`j'.dta
        append using $dtapath/du`j'.dta
        append using $dtapath/drreg`j'.dta
        append using $dtapath/dwreg`j'.dta
    }

    //id is the row number; rows are appended in the same order for every number of
    //doublets, so id is later used to match rows across the three settings
    capture drop id
    generate id = _n
    save $dtapath/$num-bigdata-low-$ver-`i', replace
}

//merges the three result datasets of Scenario 2;
//iddup is the number of doublets (16, 40 or 79) used to produce each row
use $dtapath/$num-bigdata-low-$ver-16.dta, clear
capture drop iddup
generate iddup = 16
foreach i in 40 79 {
    append using $dtapath/$num-bigdata-low-$ver-`i'.dta
    replace iddup = `i' if missing(iddup)
}

//changes the dataset from long to wide format: one set of result columns per value of iddup
reshape wide estimate stderr p min95 max95, i(id) j(iddup)
//the coefficient of the duplicate flag itself is not of interest
drop if parm == "duptag"

save $dtapath/$num-bigdata-low-all-$ver.dta, replace


/////////////////////////////////////////////////////////////////////////////////
//
//			DUPLICATES CHOSEN AROUND THE UPPER TAIL
//
/////////////////////////////////////////////////////////////////////////////////

//Scenario 2: doublets are drawn from the upper tail of the distribution of y,
//i.e. among the records at or above the 75th percentile.
//The outer loop sets the number of doublets (16, 40 or 79); the inner loop repeats
//each setting as many times as stored in the local iterations.
//The per-iteration result files (d, ddrop, du, drreg, dwreg) are overwritten by each setting.
foreach i in 16 40 79 {
    forvalues j = 1/`iterations' {
        use $dtapath/$num-bigdatabase-initial-$ver.dta, clear

        //sets the boundary of the upper tail of the distribution and keeps
        //only the records above it as candidates for duplication
        summarize y, detail
        keep if y >= r(p75)

        //draws the records that will be duplicated
        sample `i', count
        capture drop dup
        generate dup = 1
        save $dtapath/tmp1.dta, replace

        //appends the copies (dup = 1) to the full clean sample (dup = 0)
        use $dtapath/$num-bigdatabase-initial-$ver.dta, clear
        capture drop dup
        generate dup = 0
        append using $dtapath/tmp1.dta

        //duptag counts the additional copies of each record: 0 for unique records
        capture drop duptag
        duplicates tag x y z t, gen(duptag)

        //random is missing for duplicated records, so they sort last; the first rows
        //after sorting are unique records in random order. Dropping as many of them as
        //there are added copies brings the sample back to its original size.
        capture drop random
        generate random = runiform() if duptag == 0
        sort random
        drop in 1/`i'
        save $dtapath/$num-bigdatabase-$ver, replace

        //naive solution: the duplicates are ignored (indic = 11)
        regress y x z t
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 11
        save $dtapath/d`j'.dta, replace

        //dropping all duplicates, i.e. reducing the sample size (indic = 12)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        regress y x z t if duptag == 0
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 12
        save $dtapath/ddrop`j'.dta, replace

        //flagging and controlling: duptag enters the regression as a covariate (indic = 13)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        regress y x z t duptag
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 13
        save $dtapath/du`j'.dta, replace

        //dropping all duplicates but one: only the appended copies are removed (indic = 14)
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        drop if dup == 1
        regress y x z t
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 14
        save $dtapath/drreg`j'.dta, replace

        //weighting by the inverse of the multiplicity (indic = 15).
        //duptag + 1 is the number of copies of a record, so unique records get weight 1
        //and doublets get weight 1/2; no record is left with a missing weight.
        use $dtapath/$num-bigdatabase-$ver.dta, clear
        capture drop weightdup
        generate weightdup = 1/(duptag + 1)
        regress y x z t [pw = weightdup]
        parmest, saving($dtapath/tmp.dta, replace)
        use $dtapath/tmp.dta, clear
        keep parm estimate stderr p min95 max95
        capture drop indic
        generate indic = 15
        save $dtapath/dwreg`j'.dta, replace
    }

    //appends the results of all iterations to the true results (indic = 10)
    use $dtapath/$num-data-$ver.dta, clear
    forvalues j = 1/`iterations' {
        append using $dtapath/d`j'.dta
        append using $dtapath/ddrop`j'.dta
        append using $dtapath/du`j'.dta
        append using $dtapath/drreg`j'.dta
        append using $dtapath/dwreg`j'.dta
    }

    //id is the row number; rows are appended in the same order for every number of
    //doublets, so id is later used to match rows across the three settings
    capture drop id
    generate id = _n
    save $dtapath/$num-bigdata-high-$ver-`i', replace
}

//merges the three result datasets of Scenario 2;
//iddup is the number of doublets (16, 40 or 79) used to produce each row
use $dtapath/$num-bigdata-high-$ver-16.dta, clear
capture drop iddup
generate iddup = 16
foreach i in 40 79 {
    append using $dtapath/$num-bigdata-high-$ver-`i'.dta
    replace iddup = `i' if missing(iddup)
}

//changes the dataset from long to wide format: one set of result columns per value of iddup
reshape wide estimate stderr p min95 max95, i(id) j(iddup)
//the coefficient of the duplicate flag itself is not of interest
drop if parm == "duptag"

save $dtapath/$num-bigdata-high-all-$ver.dta, replace


