*** SYNTAX for the paper "Measuring Gender and Sex in Surveys: Lessons Learned from 50 Years of Cross-national Survey Data and Nonresponse Patterns" by Ilona Wysmulek - 2025

*** Figure 1

* Figure 1: percentage of survey waves whose source documentation uses the
* term "sex", "gender", or both interchangeably, by grouped interview year.

* Load the wave-level terminology sheet; the first row holds variable names.
import excel "C:\Users\iwysmulek\Desktop\Measuring_sex_gender_DATA_Wysmulek_2025.xlsx", ///
    sheet("F1_labels_wave_level") firstrow clear

label variable survey "Survey project name"
label variable wave   "Survey project wave"
label variable year   "Interview year"

* Collapse interview years into nine periods (1966-1985 is one wide group,
* the remaining groups span four years each).
recode year (1966/1985 = 1) (1986/1989 = 2) (1990/1993 = 3) ///
            (1994/1997 = 4) (1998/2001 = 5) (2002/2005 = 6) ///
            (2006/2009 = 7) (2010/2013 = 8) (2014/2017 = 9), generate(year_cat9)

label variable year_cat9 "Interview year (9 groups, 1966-2017)"
label define year_cat9 1 "1966-85" 2 "1986-89" 3 "1990-93" 4 "1994-97" 5 "1998-2001" 6 "2002-05"  7 "2006-09" 8 "2010-13" 9  "2014-17"
label values year_cat9 year_cat9

label variable sex_gend      "Respondent's sex/gender: source terminology"
label variable codebook      "Source codebook terminology"
label variable questionnaire "Source questionnaire terminology"
label variable data_dict     "Source data dictionary terminology"

* Numeric terminology code. Category 1 ("sex") is the default: every wave
* whose sex_gend is neither "gender" nor "discr" ends up in it.
generate sg_label = 1
replace  sg_label = 2 if sex_gend == "gender"
replace  sg_label = 3 if sex_gend == "discr"
label define sg_label 1 "sex" 2 "gender" 3 "sex/gender, interchangeably"
label values sg_label sg_label
label variable sg_label "Terminology used for respondent's sex/gender in source documentation"

* One 0/1 indicator per terminology category (sg_label only takes 1, 2, 3).
generate slab = (sg_label == 1)
generate glab = (sg_label == 2)
generate dlab = (sg_label == 3)

label define slab 1 "sex"
label variable slab "Sex: Terminology in source documentation"
label values slab slab

label define glab 1 "gender"
* Label text corrected: was misspelled "Termonilogy".
label variable glab "Gender: Terminology in source documentation"
label values glab glab

label define dlab 1 "sex/gender, interchangeably"
* Label text corrected: stray trailing ": " removed.
label variable dlab "Sex/gender, interchangeably: Terminology in source documentation"
label values dlab dlab

* Share of waves in each category within every year group (0-1 scale) ...
bysort year_cat9: egen ms9 = mean(slab)
bysort year_cat9: egen mg9 = mean(glab)
bysort year_cat9: egen md9 = mean(dlab)

label variable ms9 "Sex terminology: mean by grouped year"
label variable mg9 "Gender terminology: mean by grouped year"
label variable md9 "Sex/Gender terminology: mean by grouped year"

* ... and the same shares expressed as percentages.
generate ps9 = ms9 * 100
generate pg9 = mg9 * 100
generate pd9 = md9 * 100

label variable ps9 "Sex terminology: percentage of survey waves by grouped year"
label variable pg9 "Gender terminology: percentage by grouped year"
label variable pd9 "Sex/Gender terminology: percentage by grouped year"

* Stacked bars per year group. graph bar plots the mean of each variable
* within over(); the percentages are constant inside a group, so each bar
* segment equals that group's percentage. blabel(total) prints the running
* total at the top of every segment, formatted with zero decimals.
graph bar ps9 pg9 pd9, stack ///
    over(year_cat9, gap(*1) label(labsize(small))) ///
    blabel(total, color(black) size(vsmall) format(%9,0f)) ///
    bar(1, color(gs6)) bar(2, color(sand)) bar(3, color(gs10)) ///
    ytitle("Percentage of survey waves, %", size(small)) ///
    ylabel(, labsize(small)) ///
    legend(order(1 "sex" 2 "gender" 3 "sex / gender, interchangeably") size(small) region(lstyle(none)) rows(1)) ///
    graphregion(color(white))

*** Figures 4 / 5 

* Load the survey-level sheet used for Figures 4 and 5; the first row holds
* variable names.
import excel "C:\Users\iwysmulek\Desktop\Measuring_sex_gender_DATA_Wysmulek_2025.xlsx", ///
    sheet("F4-5_missings_survey_level") firstrow clear

label variable survey         "Survey project name"
label variable wave           "Survey project wave"
label variable year           "Interview year"
label variable country        "Code for country/territory unit, level 2"
label variable flag_dk        "Don't know: missing type on sex/gender"
label variable flag_othermiss "Other missing type on sex/gender"
label variable total_surv     "Total number surveys per year"
label variable total_miss     "Number of surveys with missings on sex/gender per year"

* Figure 4: yearly number of surveys with missings on sex/gender (black)
* against the yearly number of all surveys (gray), drawn as lines with
* small markers on top.
*
* The line plots use the sort option so that points are connected in
* ascending year order regardless of the row order in the sheet; without
* it the lines follow data order and zigzag if rows are not sorted by year.
* Legend settings are collected in a single legend() option and show only
* the two line plots (keys 1 and 2).
twoway ///
    (line total_miss year, sort lcolor(black) lwidth(medium)) ///
    (line total_surv year, sort lcolor(gray)) ///
    (scatter total_miss year, mcolor(black) msize(tiny)) ///
    (scatter total_surv year, mcolor(gray) msize(tiny)), ///
    legend(order(1 2) label(1 "surveys with missings for sex/gender") label(2 "all surveys") size(small) region(lstyle(none))) ///
    ytitle("Number of Surveys", size(small)) xtitle("") ///
    ylabel(, labsize(vsmall)) xlabel(1966(5)2017, labsize(vsmall)) ///
    graphregion(color(white))

* Figure 5: percentage of surveys by type of missing data on sex/gender
* ("don't know", other missings, or both), by grouped interview year.
* Uses the survey-level data already in memory.

* Collapse interview years into nine periods.
recode year (1966/1985 = 1) (1986/1989 = 2) (1990/1993 = 3) ///
            (1994/1997 = 4) (1998/2001 = 5) (2002/2005 = 6) ///
            (2006/2009 = 7) (2010/2013 = 8) (2014/2017 = 9), generate(year_cat9)

label define year_cat9 1 "1966-85" 2 "1986-89" 3 "1990-93" 4 "1994-97" 5 "1998-2001" 6 "2002-05"  7 "2006-09" 8 "2010-13" 9  "2014-17"
label values year_cat9 year_cat9
label variable year_cat9 "Interview year (9 groups, 1966-2017)"

* Missing-data type per survey:
*   0 = neither flag set, 1 = only flag_dk, 2 = only flag_othermiss,
*   3 = both flags set.
generate mtype = cond(flag_dk == 1 & flag_othermiss == 1, 3, ///
                 cond(flag_othermiss == 1, 2, ///
                 cond(flag_dk == 1, 1, 0)))

label define mtype 1 "don't know" 2 "other missings" 3 "both: don't know and other missings"
label values mtype mtype
label variable mtype "Type of missing data on sex/gender"

* Mutually exclusive 0/1 indicators, one per missing-data type.
generate dk   = (mtype == 1)
generate om   = (mtype == 2)
generate both = (mtype == 3)

label define dk 1 "don't know" 
label values dk dk
label variable dk "Missing type: Don't know"

label define om 1 "Other missings"
label values om om
label variable om "Missing type: Other missings"

label define both 1 "Both: don't know and other missings"
label values both both
label variable both "Missing type: Both don't know and other missings"

* Share of surveys of each type within every year group (0-1 scale).
foreach ind in dk om both {
    bysort year_cat9: egen m`ind' = mean(`ind')
}

label variable mdk "Don't know missing type: mean by grouped year"
label variable mom "Other missing type: mean by grouped year"
label variable mboth "Both DK and OTHER missings: mean by grouped year"

* The same shares expressed as percentages.
foreach ind in dk om both {
    generate p`ind' = m`ind' * 100
}

label variable pdk "Don't know missing type: percentage by grouped year"
label variable pom "Other missing type: percentage by grouped year"
label variable pboth "Both DK and OTHER missings: percentage by grouped year"

* Stacked bars per year group; the percentages are constant within a
* group, so the mean plotted by graph bar equals the group percentage.
graph bar pdk pom pboth, stack ///
    over(year_cat9, gap(*1) label(labsize(small))) ///
    bar(1, color(gs6)) bar(2, color(sand)) bar(3, color(gs10)) ///
    ytitle("Percentage of survey waves, %", size(small)) ///
    ylabel(, labsize(small)) ///
    legend(order(1 "don't know" 2 "other missing" 3 "both: don't know and other missing") size(small) region(lstyle(none)) rows(1)) ///
    graphregion(color(white))

*** Figure 6

* Figure 6: total number of surveys per country against the percentage of
* those surveys with any missing data on sex/gender.

* Load the country-level sheet; the first row holds variable names.
import excel "C:\Users\iwysmulek\Desktop\Measuring_sex_gender_DATA_Wysmulek_2025.xlsx", ///
    sheet("F6-7_missings_prop_by_cntr") firstrow clear

label variable cntr_iso      "Country ISO code"
label variable N_surveys     "Total number of surveys per country"
label variable perc_missings "Percent of surveys with missings on sex/gender per country"
label variable country       "Country/territory unit, level 2"
label variable clockposstyle "Technical variable to avoid label overlap"
label variable note "Note: Selected countries with more than five surveys in total (1966-2017) and at least one survey reporting missing data for sex/gender"
* Variable labels hold at most 80 characters, so the full selection rule is
* also stored as a variable note where it is kept in its entirety.
notes note: Selected countries with more than five surveys in total (1966-2017) and at least one survey reporting missing data for sex/gender

* Scatter of countries, each marked with an "x" and labelled with its ISO
* code; the label position for every point is read from clockposstyle.
* The former msymbol(0) option was dropped: digit zero is not a marker
* symbol name, and it was overridden by the later msymbol(x) anyway.
scatter N_surveys perc_missings, ///
    msymbol(x) ///
    mlabel(cntr_iso) mlabvpos(clockposstyle) mlabcolor(black) mlabsize(vsmall) ///
    ytitle("Total Number of Surveys, 1966-2017", size(small)) ///
    xtitle("Percentage of Surveys with Any Missing Data for Sex/Gender", size(small)) ///
    xlabel(0(10)100, grid) ///
    ylabel(0(10)100, angle(0) format(%9.0f)) ///
    graphregion(color(white))

*** Figure 7

* Dot chart of perc_missings for countries with at least 20 surveys,
* ordered from the highest to the lowest value. Uses the country-level
* data already in memory. The dot guide lines are drawn in the background
* color, leaving only the black "x" markers visible.
* NOTE(review): over() refers to "cntr", which is not among the variables
* labelled for this sheet; presumably it resolves by variable abbreviation
* to cntr_iso - confirm, and spell the name out if so.
graph dot perc_missings if N_surveys >= 20, ///
    over(cntr, sort(1) descending label(labsize(vsmall))) ///
    marker(1, msymbol(x) mcolor(black)) ///
    dots(mcolor(bg)) ///
    ylabel(0(5)80, labsize(vsmall)) ///
    ytitle("Percentage of Surveys with Any Missing Data for Sex/Gender", size(vsmall)) ///
    graphregion(color(white))
