####################################################################################
# Code to process European census data and calculate Duncan Dissimilarity Indexes
#
# Variable: Marital status
#
#
# Author: Jan-Lucas Schanze
# Date:   07.10.2022
# Paper: (SRM submission #7830)
###################################################################################

### Preparing the data preparation and analysis

# 1. Store raw data (csv files) in a folder     -> directory will be defined as input below
# 2. Create folder for data outputs and graphs  -> directory will be defined as output below

### OVERVIEW OF VARIABLES

# gender:	        0 (male)
#                 1 (female)

# housing status: 1 (living in a private household)
#                 2 (institutional household)
#                 3 (primary homeless)
#                 4 (Not stated)

# mstat (OLD):	  1 (never married)
#                 2 (married)
#                 3 (widowed)
#                 4 (divorced)
#                 5 (registered partnership)
#                 6 (reg. part. widowed)
#                 7 (reg. part. dissolved)
#                 8 (not stated)

# mstat (NEW):	  1 (never married)
#                 2 (married)
#                 3 (widowed)
#                 4 (divorced)
#                 8 (not stated)


######## Installing packages (required once)

#install.packages("foreign")
#install.packages("haven")
#install.packages("Rcmdr")
#install.packages("readr")
#install.packages("lattice")
#install.packages("ggplot2")
#install.packages("tidyverse")
#install.packages("beanplot")
#install.packages("DAAG")
#install.packages("visreg")
#install.packages("HSAUR")
#install.packages("lme4")
#install.packages("readxl")
#install.packages("dplyr")
#install.packages("reshape2")
#install.packages(sjPlot)
#install.packages(mlmRev)
#install.packages("ggmosaic")
#install.packages("gridExtra")

###### Loading packages
library(foreign)
library(haven)
library(Rcmdr)
library(readr)
library(lattice)
library(ggplot2)
library(tidyverse)
library(beanplot)
library(DAAG)
library(visreg)
library(HSAUR)
library(lme4)
library(readxl)
library(dplyr)
library(reshape2)
library(ggmosaic)
library(gridExtra)

# Setting paths for analysis
getwd()

main.path <- "C:\\Users\\schanzjs\\Desktop\\source files\\data"  # change accordingly
save.path <- "C:\\Users\\schanzjs\\Desktop\\source files\\output" # change accordingly 

setwd(main.path)

# Loading dataset
hyper_mstat <- read_xlsx("censushub_age_gender_mstat.xlsx", sheet = "Data") # Loading dataset with labels
hyper_mstat[,1:10] # Have a look at columns 1 through 11

## Dropping countries not needed 
hyper_mstat <- subset(hyper_mstat, cntry!="Finland" & cntry!="Ireland") # Finland and Ireland have only missing values in institutions

# Recoding missing values for all variables
for(i in 1:ncol(hyper_mstat)){
  hyper_mstat[hyper_mstat[,i]=="NA",i] <-  NA
}

# Define variables as numeric
hyper_mstat$gender <- as.numeric(as.character(hyper_mstat$gender))
hyper_mstat$housing <- as.numeric(as.character(hyper_mstat$housing))
hyper_mstat$mstat <- as.numeric(as.character(hyper_mstat$mstat))

hyper_mstat$ageupto15 <- as.numeric(as.character(hyper_mstat$ageupto15))
hyper_mstat$age1529 <- as.numeric(as.character(hyper_mstat$age1529))
hyper_mstat$age3049 <- as.numeric(as.character(hyper_mstat$age3049))
hyper_mstat$age5064 <- as.numeric(as.character(hyper_mstat$age5064))
hyper_mstat$age6584 <- as.numeric(as.character(hyper_mstat$age6584))
hyper_mstat$age85plus <- as.numeric(as.character(hyper_mstat$age85plus))

# MERGING PARTNERSHIP CATEGORIES

# Married + Registered partnership (cat. 2 + 5)
hyper_married <- subset(hyper_mstat, mstat==2 | mstat==5)

hyper_married_agg <- aggregate(cbind(hyper_married$ageupto15,
                                     hyper_married$age1529,
                                     hyper_married$age3049,
                                     hyper_married$age5064,
                                     hyper_married$age6584,
                                     hyper_married$age85plus) ~ hyper_married$cntry + hyper_married$gender + hyper_married$housing, data = hyper_married, FUN = function(x) c(gama = sum(x)))

hyper_married_agg$mstat <- c(2) # generate variable mstat == 2 (married)

colnames(hyper_married_agg)[1] <- "cntry"
colnames(hyper_married_agg)[2] <- "gender"
colnames(hyper_married_agg)[3] <- "housing"
colnames(hyper_married_agg)[4] <- "ageupto15"
colnames(hyper_married_agg)[5] <- "age1529"
colnames(hyper_married_agg)[6] <- "age3049"
colnames(hyper_married_agg)[7] <- "age5064"
colnames(hyper_married_agg)[8] <- "age6584"
colnames(hyper_married_agg)[9] <- "age85plus"

# Widowed (cat. 3 + 6)
hyper_widowed <- subset(hyper_mstat, mstat==3 | mstat==6)

hyper_widowed_agg <- aggregate(cbind(hyper_widowed$ageupto15,
                                     hyper_widowed$age1529,
                                     hyper_widowed$age3049,
                                     hyper_widowed$age5064,
                                     hyper_widowed$age6584,
                                     hyper_widowed$age85plus) ~ hyper_widowed$cntry + hyper_widowed$gender + hyper_widowed$housing, data = hyper_widowed, FUN = function(x) c(gama = sum(x)))

hyper_widowed_agg$mstat <- c(3) # generate variable mstat == 3 (widowed)

colnames(hyper_widowed_agg)[1] <- "cntry"
colnames(hyper_widowed_agg)[2] <- "gender"
colnames(hyper_widowed_agg)[3] <- "housing"
colnames(hyper_widowed_agg)[4] <- "ageupto15"
colnames(hyper_widowed_agg)[5] <- "age1529"
colnames(hyper_widowed_agg)[6] <- "age3049"
colnames(hyper_widowed_agg)[7] <- "age5064"
colnames(hyper_widowed_agg)[8] <- "age6584"
colnames(hyper_widowed_agg)[9] <- "age85plus"

# Divorced (cat. 4 + 7)
hyper_divorced <- subset(hyper_mstat, mstat==4 | mstat==7)

hyper_divorced_agg <- aggregate(cbind(hyper_divorced$ageupto15,
                                     hyper_divorced$age1529,
                                     hyper_divorced$age3049,
                                     hyper_divorced$age5064,
                                     hyper_divorced$age6584,
                                     hyper_divorced$age85plus) ~ hyper_divorced$cntry + hyper_divorced$gender + hyper_divorced$housing, data = hyper_divorced, FUN = function(x) c(gama = sum(x)))

hyper_divorced_agg$mstat <- c(4) # generate variable mstat == 4 (divorced)

colnames(hyper_divorced_agg)[1] <- "cntry"
colnames(hyper_divorced_agg)[2] <- "gender"
colnames(hyper_divorced_agg)[3] <- "housing"
colnames(hyper_divorced_agg)[4] <- "ageupto15"
colnames(hyper_divorced_agg)[5] <- "age1529"
colnames(hyper_divorced_agg)[6] <- "age3049"
colnames(hyper_divorced_agg)[7] <- "age5064"
colnames(hyper_divorced_agg)[8] <- "age6584"
colnames(hyper_divorced_agg)[9] <- "age85plus"

## APPENDING 
hyper_mstat2 <- subset(hyper_mstat, mstat==1 | mstat==8)
hyper_mstat2 <- rbind(hyper_mstat2, hyper_married_agg, hyper_widowed_agg, hyper_divorced_agg)

## removing variables not required anymore
rm("hyper_divorced", "hyper_divorced_agg", "hyper_married", "hyper_married_agg", "hyper_widowed", "hyper_widowed_agg")


## AGGREGATING FOR THE TOTAL

# Aggregating all countries for European total
hyper_mstat_eu <- hyper_mstat2

## checking whether list of countries is correct
prop.table(table(hyper_mstat_eu$cntry, hyper_mstat_eu$gender), 2)

hyper_mstat_eu_agg <- aggregate(cbind(hyper_mstat_eu$ageupto15,
                                      hyper_mstat_eu$age1529,
                                      hyper_mstat_eu$age3049,
                                      hyper_mstat_eu$age5064,
                                      hyper_mstat_eu$age6584,
                                      hyper_mstat_eu$age85plus) ~ hyper_mstat_eu$gender + hyper_mstat_eu$housing + hyper_mstat_eu$mstat, data = hyper_mstat_eu, FUN = function(x) c(gama = sum(x)))

hyper_mstat_eu_agg$cntry <- c("Europe")

colnames(hyper_mstat_eu_agg)[1] <- "gender"
colnames(hyper_mstat_eu_agg)[2] <- "housing"
colnames(hyper_mstat_eu_agg)[3] <- "mstat"
colnames(hyper_mstat_eu_agg)[4] <- "ageupto15"
colnames(hyper_mstat_eu_agg)[5] <- "age1529"
colnames(hyper_mstat_eu_agg)[6] <- "age3049"
colnames(hyper_mstat_eu_agg)[7] <- "age5064"
colnames(hyper_mstat_eu_agg)[8] <- "age6584"
colnames(hyper_mstat_eu_agg)[9] <- "age85plus"

## Appending aggregated data
hyper_mstat2 <- rbind(hyper_mstat2, hyper_mstat_eu_agg)

# cleaning up
rm("hyper_mstat_eu", "hyper_mstat_eu_agg")

### dropping housing == 3 (= primary homeless) and housing == 4 (= Not stated)
hyper_mstat2 <- subset(hyper_mstat2, hyper_mstat2$housing == 1 | hyper_mstat2$housing == 2) # dropping housing types not required

### TOTALS by gender (all types of housing)
totals <- aggregate(cbind(hyper_mstat2$ageupto15,
                          hyper_mstat2$age1529,
                          hyper_mstat2$age3049,
                          hyper_mstat2$age5064,
                          hyper_mstat2$age6584,
                          hyper_mstat2$age85plus) ~ hyper_mstat2$cntry + hyper_mstat2$gender, data = hyper_mstat2, FUN = function(x) c(gama = sum(x)))

## renaming variables
colnames(totals)[1] <- "cntry"
colnames(totals)[2] <- "gender"
colnames(totals)[3] <- "totalupto15"
colnames(totals)[4] <- "total1529"
colnames(totals)[5] <- "total3049"
colnames(totals)[6] <- "total5064"
colnames(totals)[7] <- "total6584"
colnames(totals)[8] <- "total85plus"

## Total for entire population (by gender)
totals$total_allage <- totals$totalupto15 + totals$total1529 + totals$total3049 + totals$total5064 + totals$total6584 + totals$total85plus

### TOTALS by gender and housing
totals_within <- aggregate(cbind(hyper_mstat2$ageupto15,
                          hyper_mstat2$age1529,
                          hyper_mstat2$age3049,
                          hyper_mstat2$age5064,
                          hyper_mstat2$age6584,
                          hyper_mstat2$age85plus) ~ hyper_mstat2$cntry + hyper_mstat2$gender + hyper_mstat2$housing, data = hyper_mstat2, FUN = function(x) c(gama = sum(x)))

## renaming variables
colnames(totals_within)[1] <- "cntry"
colnames(totals_within)[2] <- "gender"
colnames(totals_within)[3] <- "housing"
colnames(totals_within)[4] <- "totwithinupto15"
colnames(totals_within)[5] <- "totwithin1529"
colnames(totals_within)[6] <- "totwithin3049"
colnames(totals_within)[7] <- "totwithin5064"
colnames(totals_within)[8] <- "totwithin6584"
colnames(totals_within)[9] <- "totwithin85plus"

## Total for entire population (by gender and housing)
totals_within$totwithin_allage <- totals_within$totwithinupto15 + totals_within$totwithin1529 + totals_within$totwithin3049 + totals_within$totwithin5064 + totals_within$totwithin6584 + totals_within$totwithin85plus

### TOTALS by gender and mstat (housing collapsed)
totals_mstat <- aggregate(cbind(hyper_mstat2$ageupto15,
                                 hyper_mstat2$age1529,
                                 hyper_mstat2$age3049,
                                 hyper_mstat2$age5064,
                                 hyper_mstat2$age6584,
                                 hyper_mstat2$age85plus) ~ hyper_mstat2$cntry + hyper_mstat2$gender + hyper_mstat2$mstat, data = hyper_mstat2, FUN = function(x) c(gama = sum(x)))

## renaming variables
colnames(totals_mstat)[1] <- "cntry"
colnames(totals_mstat)[2] <- "gender"
colnames(totals_mstat)[3] <- "mstat"
colnames(totals_mstat)[4] <- "totmstatupto15"
colnames(totals_mstat)[5] <- "totmstat1529"
colnames(totals_mstat)[6] <- "totmstat3049"
colnames(totals_mstat)[7] <- "totmstat5064"
colnames(totals_mstat)[8] <- "totmstat6584"
colnames(totals_mstat)[9] <- "totmstat85plus"

# generating a data frame with information from the three data frames
mstat_fin1 <- merge(hyper_mstat2, totals, by=c("cntry","gender"))                   # Step 1
mstat_fin2 <- merge(mstat_fin1, totals_mstat, by=c("cntry","gender", "mstat"))      # Step 2
mstat_final <- merge(mstat_fin2, totals_within, by=c("cntry","gender", "housing"))  # Step 3


############################
## CALCULATING SHARES
############################

# calculation: Share within age & gender cohorts, across all types of housing 
mstat_final$shareupto15 <- round((mstat_final$ageupto15/mstat_final$totalupto15), digits = 4)
mstat_final$share1529 <- round((mstat_final$age1529/mstat_final$total1529), digits = 4)
mstat_final$share3049 <- round((mstat_final$age3049/mstat_final$total3049), digits = 4)
mstat_final$share5064 <- round((mstat_final$age5064/mstat_final$total5064), digits = 4)
mstat_final$share6584 <- round((mstat_final$age6584/mstat_final$total6584), digits = 4)
mstat_final$share85plus <- round((mstat_final$age85plus/mstat_final$total85plus), digits = 4)

# calculation: Share within age & gender cohorts, types of housing separated 
mstat_final$sharewithinupto15 <- round((mstat_final$ageupto15/mstat_final$totwithinupto15), digits = 4)
mstat_final$sharewithin1529 <- round((mstat_final$age1529/mstat_final$totwithin1529), digits = 4)
mstat_final$sharewithin3049 <- round((mstat_final$age3049/mstat_final$totwithin3049), digits = 4)
mstat_final$sharewithin5064 <- round((mstat_final$age5064/mstat_final$totwithin5064), digits = 4)
mstat_final$sharewithin6584 <- round((mstat_final$age6584/mstat_final$totwithin6584), digits = 4)
mstat_final$sharewithin85plus <- round((mstat_final$age85plus/mstat_final$totwithin85plus), digits = 4)

# calculation: Total share within age & gender cohorts, types of housing collapsed (for DID) 
mstat_final$sharemstatupto15 <- round((mstat_final$totmstatupto15/mstat_final$totalupto15), digits = 4)
mstat_final$sharemstat1529 <- round((mstat_final$totmstat1529/mstat_final$total1529), digits = 4)
mstat_final$sharemstat3049 <- round((mstat_final$totmstat3049/mstat_final$total3049), digits = 4)
mstat_final$sharemstat5064 <- round((mstat_final$totmstat5064/mstat_final$total5064), digits = 4)
mstat_final$sharemstat6584 <- round((mstat_final$totmstat6584/mstat_final$total6584), digits = 4)
mstat_final$sharemstat85plus <- round((mstat_final$totmstat85plus/mstat_final$total85plus), digits = 4)

# Collapsing GENDERS for joint DID
### 1a: collapsing genders while leaving housing and mstat separate 
mstat_did_1a <- aggregate(cbind(ageupto15,
                                  age1529,
                                  age3049,
                                  age5064,
                                  age6584,
                                  age85plus) ~ cntry + mstat + housing, data = mstat_final, FUN = function(x) c(gama = sum(x)))

### 1b: collapsing genders and mstat while leaving housing separate
mstat_did_1b <- aggregate(cbind(ageupto15,
                                  age1529,
                                  age3049,
                                  age5064,
                                  age6584,
                                  age85plus) ~ cntry + housing, data = mstat_final, FUN = function(x) c(gama = sum(x)))

colnames(mstat_did_1b)[3] <- "totwithinupto15_both"
colnames(mstat_did_1b)[4] <- "totwithin1529_both"
colnames(mstat_did_1b)[5] <- "totwithin3049_both"
colnames(mstat_did_1b)[6] <- "totwithin5064_both"
colnames(mstat_did_1b)[7] <- "totwithin6584_both"
colnames(mstat_did_1b)[8] <- "totwithin85plus_both"

### 2a: Totals for mstat, housing collapsed 
mstat_tot_both <- aggregate(cbind(ageupto15,
                                age1529,
                                age3049,
                                age5064,
                                age6584,
                                age85plus) ~ cntry + mstat, data = mstat_final, FUN = function(x) c(gama = sum(x)))

colnames(mstat_tot_both)[3] <- "totmstatupto15_both"
colnames(mstat_tot_both)[4] <- "totmstat1529_both"
colnames(mstat_tot_both)[5] <- "totmstat3049_both"
colnames(mstat_tot_both)[6] <- "totmstat5064_both"
colnames(mstat_tot_both)[7] <- "totmstat6584_both"
colnames(mstat_tot_both)[8] <- "totmstat85plus_both"

### 2b: Calculating the total value of residents in respective age cohorts
totals_both <- aggregate(cbind(totalupto15,
                               total1529,
                               total3049,
                               total5064,
                               total6584,
                               total85plus) ~ cntry, data = totals, FUN = function(x) c(gama = sum(x)))

# generating a data frame with information from the three data frames
mstat_temp1 <- merge(mstat_did_1a, mstat_did_1b, by=c("cntry","housing"))   # Step 1
mstat_temp2 <- merge(mstat_temp1, mstat_tot_both, by=c("cntry","mstat"))    # Step 2
mstat_did_both <- merge(mstat_temp2, totals_both, by=c("cntry"))            # Step 3

## calculating the share of mstat within age cohorts / types of housing
mstat_did_both$share_mstatupto15 <- round((mstat_did_both$ageupto15/mstat_did_both$totwithinupto15_both), digits = 4)
mstat_did_both$share_mstat1529 <- round((mstat_did_both$age1529/mstat_did_both$totwithin1529_both), digits = 4)
mstat_did_both$share_mstat3049 <- round((mstat_did_both$age3049/mstat_did_both$totwithin3049_both), digits = 4)
mstat_did_both$share_mstat5064 <- round((mstat_did_both$age5064/mstat_did_both$totwithin5064_both), digits = 4)
mstat_did_both$share_mstat6584 <- round((mstat_did_both$age6584/mstat_did_both$totwithin6584_both), digits = 4)
mstat_did_both$share_mstat85plus <- round((mstat_did_both$age85plus/mstat_did_both$totwithin85plus_both), digits = 4)

# calculation: Total share of mstat within age cohorts, gender & types of housing collapsed (for DID) 
mstat_did_both$fullshareupto15 <- round((mstat_did_both$totmstatupto15_both/mstat_did_both$totalupto15), digits = 4)
mstat_did_both$fullshare1529 <- round((mstat_did_both$totmstat1529_both/mstat_did_both$total1529), digits = 4)
mstat_did_both$fullshare3049 <- round((mstat_did_both$totmstat3049_both/mstat_did_both$total3049), digits = 4)
mstat_did_both$fullshare5064 <- round((mstat_did_both$totmstat5064_both/mstat_did_both$total5064), digits = 4)
mstat_did_both$fullshare6584 <- round((mstat_did_both$totmstat6584_both/mstat_did_both$total6584), digits = 4)
mstat_did_both$fullshare85plus <- round((mstat_did_both$totmstat85plus_both/mstat_did_both$total85plus), digits = 4)

## cleaning up
rm("mstat_fin1", "mstat_fin2", "mstat_did_1a", "mstat_did_1b", "mstat_temp1", "mstat_temp2")

### Calculating Duncan's Index of Dissimilarity (DID) -> SEPARATELY for the 2 genders

# 1) Subtract share in private households from total population
mstat_final$duncan_mstatupto15  <- abs(mstat_final$sharemstatupto15-mstat_final$sharewithinupto15)
mstat_final$duncan_mstat1529    <- abs(mstat_final$sharemstat1529-mstat_final$sharewithin1529)
mstat_final$duncan_mstat3049    <- abs(mstat_final$sharemstat3049-mstat_final$sharewithin3049)
mstat_final$duncan_mstat5064    <- abs(mstat_final$sharemstat5064-mstat_final$sharewithin5064) 
mstat_final$duncan_mstat6584    <- abs(mstat_final$sharemstat6584-mstat_final$sharewithin6584)   
mstat_final$duncan_mstat85plus  <- abs(mstat_final$sharemstat85plus-mstat_final$sharewithin85plus)   

# 2) Calculate sum over all categories of the variable
duncan_mstat <- aggregate(cbind(duncan_mstatupto15,
                                duncan_mstat1529,
                                duncan_mstat3049,
                                duncan_mstat5064,
                                duncan_mstat6584,
                                duncan_mstat85plus) ~ cntry + gender + housing, data = mstat_final, FUN = function(x) c(gama = sum(x)))

# 3) Divide by 2 
duncan_mstat$duncan_mstatupto15 <- (duncan_mstat$duncan_mstatupto15)/2
duncan_mstat$duncan_mstat1529   <- (duncan_mstat$duncan_mstat1529)/2
duncan_mstat$duncan_mstat3049   <- (duncan_mstat$duncan_mstat3049)/2
duncan_mstat$duncan_mstat5064   <- (duncan_mstat$duncan_mstat5064)/2
duncan_mstat$duncan_mstat6584   <- (duncan_mstat$duncan_mstat6584)/2
duncan_mstat$duncan_mstat85plus <- (duncan_mstat$duncan_mstat85plus)/2

### Calculating Duncan's Index of Dissimilarity (DID) -> for the 2 genders COMBINED

# 1) Subtract share in private households from total population
mstat_did_both$duncboth_mstatupto15  <- abs(mstat_did_both$share_mstatupto15 - mstat_did_both$fullshareupto15)
mstat_did_both$duncboth_mstat1529    <- abs(mstat_did_both$share_mstat1529 - mstat_did_both$fullshare1529)
mstat_did_both$duncboth_mstat3049    <- abs(mstat_did_both$share_mstat3049 - mstat_did_both$fullshare3049)
mstat_did_both$duncboth_mstat5064    <- abs(mstat_did_both$share_mstat5064 - mstat_did_both$fullshare5064) 
mstat_did_both$duncboth_mstat6584    <- abs(mstat_did_both$share_mstat6584 - mstat_did_both$fullshare6584)   
mstat_did_both$duncboth_mstat85plus  <- abs(mstat_did_both$share_mstat85plus - mstat_did_both$fullshare85plus)   

# 2) Calculate sum over all categories of the variable
duncboth_mstat <- aggregate(cbind(duncboth_mstatupto15,
                                  duncboth_mstat1529,
                                  duncboth_mstat3049,
                                  duncboth_mstat5064,
                                  duncboth_mstat6584,
                                  duncboth_mstat85plus) ~ cntry + housing, data = mstat_did_both, FUN = function(x) c(gama = sum(x)))

# 3) Divide by 2 
duncboth_mstat$duncboth_mstatupto15 <- (duncboth_mstat$duncboth_mstatupto15)/2
duncboth_mstat$duncboth_mstat1529   <- (duncboth_mstat$duncboth_mstat1529)/2
duncboth_mstat$duncboth_mstat3049   <- (duncboth_mstat$duncboth_mstat3049)/2
duncboth_mstat$duncboth_mstat5064   <- (duncboth_mstat$duncboth_mstat5064)/2
duncboth_mstat$duncboth_mstat6584   <- (duncboth_mstat$duncboth_mstat6584)/2
duncboth_mstat$duncboth_mstat85plus <- (duncboth_mstat$duncboth_mstat85plus)/2


##############################
### PREPARING VISUALIZATION 
##############################

duncan_mstat_phh <- subset(duncboth_mstat, housing == 1)
duncan_mstat_phh <- duncan_mstat_phh[-c(2)]

duncan_mstat_phhboth <- melt(duncan_mstat_phh, id.vars = "cntry")

duncan_mstat_phhboth$age <- 0
duncan_mstat_phhboth$age[duncan_mstat_phhboth$variable == "duncboth_mstatupto15"] <- 7.5
duncan_mstat_phhboth$age[duncan_mstat_phhboth$variable == "duncboth_mstat1529"] <- 22
duncan_mstat_phhboth$age[duncan_mstat_phhboth$variable == "duncboth_mstat3049"] <- 39.5
duncan_mstat_phhboth$age[duncan_mstat_phhboth$variable == "duncboth_mstat5064"] <-57
duncan_mstat_phhboth$age[duncan_mstat_phhboth$variable == "duncboth_mstat6584"] <- 74.5
duncan_mstat_phhboth$age[duncan_mstat_phhboth$variable == "duncboth_mstat85plus"] <- 92.5

colnames(duncan_mstat_phhboth)[1] <- "Countries"

### create identifier of types of countries (by the relative size of the institutionalized population)

duncan_mstat_phhboth$type <- 2

duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Austria"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Belgium"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Cyprus"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Germany"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Spain"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Finland"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Croatia"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Ireland"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Iceland"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Italy"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Norway"] <- 1 
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Portugal"] <- 1  
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Slovakia"] <- 1 

duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Bulgaria"] <- 3
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Greece"] <- 3
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Latvia"] <- 3
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Lithuania"] <- 3
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Poland"] <- 3
duncan_mstat_phhboth$type[duncan_mstat_phhboth$Countries == "Romania"] <- 3

duncan_mstat_phhboth$type <- as.factor(duncan_mstat_phhboth$type)


### Preparing mstat_final for visualization
mstat_final$mstat <- as.character(mstat_final$mstat)

mstat_final$housing_gender <- c("Blank")
mstat_final$housing_gender[mstat_final$gender == 0 & mstat_final$housing == 1] <- "M. p.hh."
mstat_final$housing_gender[mstat_final$gender == 1 & mstat_final$housing == 1] <- "F. p.hh."
mstat_final$housing_gender[mstat_final$gender == 0 & mstat_final$housing == 2] <- "M. inst."
mstat_final$housing_gender[mstat_final$gender == 1 & mstat_final$housing == 2] <- "F. inst."

### Saving data frames 
setwd(save.path)

save(mstat_final, file = "mstat_final.Rda")
save(duncan_mstat_phhboth, file = "duncan_mstat_phhboth.Rda")


### Function to extract legend
get_legend <- function(myggplot){
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}


# Fixing error in ggmosiac
is.formula <- function (x) inherits(x, "formula")

is.discrete <- function(x) {
  is.factor(x) || is.character(x) || is.logical(x)
}

product_names <- function() {
  function(x) {
    #cat(" in product_breaks\n")
    #browser()
    unique(x)
  }
}

product_breaks <- function() {
  function(x) {
    #cat(" in product_breaks\n")
    #browser()
    unique(x)
  }
}

product_labels <- function() {
  function(x) {
    #cat(" in product_labels\n")
    #browser()
    
    unique(x)
  }
}

is.waive <- getFromNamespace("is.waive", "ggplot2")

#' Helper function for determining scales
#'
#' Used internally to determine class of variable x
#' @param x variable
#' @return character string "productlist"
#' @importFrom ggplot2 scale_type
#' @export
scale_type.productlist <- function(x) {
  #  cat("checking for type productlist\n")
  #browser()
  "productlist"
}

#' Determining scales for mosaics
#'
#' @param name set to pseudo waiver function `product_names` by default.
#' @inheritParams ggplot2::continuous_scale
#' @export
scale_x_productlist <- function(name = ggplot2::waiver(), breaks = product_breaks(),
                                minor_breaks = NULL, labels = product_labels(),
                                limits = NULL, expand = ggplot2::waiver(), oob = scales:::censor,
                                na.value = NA_real_, trans = "identity",
                                position = "bottom", sec.axis = ggplot2::waiver()) {
  #browser()
  sc <- ggplot2::continuous_scale(
    c("x", "xmin", "xmax", "xend", "xintercept", "xmin_final", "xmax_final", "xlower", "xmiddle", "xupper"),
    "position_c", identity, name = name, breaks = breaks,
    minor_breaks = minor_breaks, labels = labels, limits = limits,
    expand = expand, oob = oob, na.value = na.value, trans = trans,
    guide = ggplot2::waiver(), position = position, super = ScaleContinuousProduct
  )
  
  
  if (!is.waive(sec.axis)) {
    if (is.formula(sec.axis)) sec.axis <- ggplot2::sec_axis(sec.axis)
    is.sec_axis = getFromNamespace("is.sec_axis", "ggplot2")
    if (is.sec_axis(sec.axis)) stop("Secondary axes must be specified using 'sec_axis()'")
    sc$secondary.axis <- sec.axis
  }
  sc
}

#' @rdname scale_x_productlist
#' @param sec.axis specify a secondary axis
#' @export
scale_y_productlist <- function(name = ggplot2::waiver(), breaks = product_breaks(),
                                minor_breaks = NULL, labels = product_labels(),
                                limits = NULL, expand = ggplot2::waiver(), oob = scales:::censor,
                                na.value = NA_real_, trans = "identity",
                                position = "left", sec.axis = ggplot2::waiver()) {
  #browser()
  sc <- ggplot2::continuous_scale(
    c("y", "ymin", "ymax", "yend", "yintercept", "ymin_final", "ymax_final", "ylower", "ymiddle", "yupper"),
    "position_c", identity, name = name, breaks = breaks,
    minor_breaks = minor_breaks, labels = labels, limits = limits,
    expand = expand, oob = oob, na.value = na.value, trans = trans,
    guide = ggplot2::waiver(), position = position, super = ScaleContinuousProduct
  )
  
  if (!is.waive(sec.axis)) {
    if (is.formula(sec.axis)) sec.axis <- ggplot2::sec_axis(sec.axis)
    is.sec_axis = getFromNamespace("is.sec_axis", "ggplot2")
    if (is.sec_axis(sec.axis)) stop("Secondary axes must be specified using 'sec_axis()'")
    sc$secondary.axis <- sec.axis
  }
  sc
}

#' @rdname scale_x_productlist
#' @export
ScaleContinuousProduct <- ggproto(
  "ScaleContinuousProduct", ScaleContinuousPosition,
  train =function(self, x) {
    #cat("train in ScaleContinuousProduct\n")
    #cat("class of variable: ")
    #cat(class(x))
    #browser()
    if (is.list(x)) {
      x <- x[[1]]
      if ("Scale" %in% class(x)) {
        #browser()
        # re-assign the scale values now that we have the information - but only if necessary
        if (is.function(self$breaks)) self$breaks <- x$breaks
        if (is.function(self$labels)) self$labels <- x$labels
        if (is.waive(self$name)) {
          self$product_name <- gsub("x__alpha__", "", x$name)
          self$product_name <- gsub("x__fill__", "", self$product_name)
          self$product_name <- gsub("x__", "", self$product_name)
          self$product_name <- gsub("conds\\d__", "", self$product_name)
        }
        #cat("\n")
        return()
      }
    }
    if (is.discrete(x)) {
      self$range$train(x=c(0,1))
      #cat("\n")
      return()
    }
    self$range$train(x)
    #cat("\n")
  },
  map = function(self, x, limits = self$get_limits()) {
    #cat("map in ScaleContinuousProduct\n")
    #browser()
    if (is.discrete(x)) return(x)
    if (is.list(x)) return(0) # need a number
    scaled <- as.numeric(self$oob(x, limits))
    ifelse(!is.na(scaled), scaled, self$na.value)
  },
  dimension = function(self, expand = c(0, 0)) {
    #cat("dimension in ScaleContinuousProduct\n")
    c(-0.05,1.05)
  },
  make_title = function(title, self) {
    #browser()
    if(title %in% self$aesthetics){
      title <- self$product_name
    }
    else title
  }
)


#------------------------------------------------#
#      EXAMPLES FOR VISUALIZATION
#------------------------------------------------#

### 1. Duncan plots for both genders
theme_set(theme_light()) 
theme_replace(text = element_text(size=10), legend.position = "bottom")

### BOTH GENDERS              
fig_duncan_mstat <- ggplot(duncan_mstat_phhboth, aes(x = age, y = value, group = type)) + 
  geom_point(aes(shape=type), size=3) +
  scale_y_continuous(name="", limits=c(0, 0.075), labels = scales::percent_format(accuracy = 1L)) + 
  scale_x_continuous(breaks=c(7.5, 22, 39.5, 57, 74.5, 92.5), labels=c("<15", "15-29", "30-49", "50-64", "65-84", "85+"), name = NULL) +
  scale_shape_discrete(name = "Groups of countries by relative size of inst. pop.", labels = c("One-peak country", "Two-peak country", "Residual country")) +
  #geom_text(aes(label=ifelse(value>0.035,as.character(Countries),'')),hjust=1.2,vjust=0)
  ggtitle("Duncan's index of dissimilarity for marital status") + theme(plot.title = element_text(hjust = 0.5)) 
  #ggsave("srm-7830_figure-3_example.pdf", fig_duncan_mstat)

fig_duncan_mstat


### 2. Mosaic plot for single country or specific age groups

# Selecting a single country & the two main types of housing
mstat_EU <- subset(mstat_final, cntry=="Europe" & (housing == 1 | housing == 2))  

## extracting a legend first -> only done once
mstat_6584_b <-  ggplot(mstat_EU, aes(y=housing_gender, x=sharewithin6584, fill=mstat)) +
  geom_col() +
  scale_fill_manual(values = alpha(c("gray20", "gray40", "gray60", "gray80", "black"), 0.85), name = "Marital status", labels = c("Never married", "Married", "Widowed", "Divorced", "Not stated")) +
  scale_y_discrete(name="") +
  scale_x_continuous(name="", limits=c(0, 1.01), labels = scales::percent) +
  ggtitle("65 to 84 years") + theme(plot.title = element_text(hjust = 0.5, size = 1)) + 
  theme_light()
## generating and saving a legend for one plot
mstat_6584_b <- mstat_6584_b + theme(legend.position="bottom", legend.text = element_text(colour="gray20", size=8))
legend_mstat <- get_legend(mstat_6584_b)


### producing plots for specific age groups
mstat3049 <- ggplot(data = mstat_EU) +
  geom_mosaic(aes(weight = age3049, x = product(housing_gender), fill = mstat), na.rm=TRUE, show.legend = FALSE)  +
  scale_y_continuous(name="", limits=c(0, 1), labels = scales::percent) +
  scale_fill_manual(values = alpha(c("gray20","gray40", "gray60", "gray80", "black"), 0.85), name = "Marital status", labels = c("Never married", "Married", "Widowed", "Divorced", "Not stated")) +
  #facet_grid(gender~.) +
  labs(x ="") +
  ggtitle("30 to 49 years") + theme(plot.title = element_text(hjust = 0.5)) + 
  theme_light() 

mstat5064 <- ggplot(data = mstat_EU) +
  geom_mosaic(aes(weight = age5064, x = product(housing_gender), fill = mstat), na.rm=TRUE, show.legend = FALSE)  +
  scale_y_continuous(name="", limits=c(0, 1), labels = scales::percent) +
  scale_fill_manual(values = alpha(c("gray20","gray40", "gray60", "gray80", "black"), 0.85), name = "Marital status", labels = c("Never married", "Married", "Widowed", "Divorced", "Not stated")) +
  #facet_grid(gender~.) +
  labs(x ="") +
  ggtitle("50 to 64 years") + theme(plot.title = element_text(hjust = 0.5)) + 
  theme_light() 


mstat6584 <- ggplot(data = mstat_EU) +
  geom_mosaic(aes(weight = age6584, x = product(housing_gender), fill = mstat), na.rm=TRUE, show.legend = FALSE)  +
  scale_y_continuous(name="", limits=c(0, 1), labels = scales::percent) +
  scale_fill_manual(values = alpha(c("gray20","gray40", "gray60", "gray80", "black"), 0.85), name = "Marital status", labels = c("Never married", "Married", "Widowed", "Divorced", "Not stated")) +
  #facet_grid(gender~.) +
  labs(x ="") +
  ggtitle("65 to 84 years") + theme(plot.title = element_text(hjust = 0.5)) + 
  theme_light() 


mstat85plus <- ggplot(data = mstat_EU) +
  geom_mosaic(aes(weight = age85plus, x = product(housing_gender), fill = mstat), na.rm=TRUE, show.legend = FALSE)  +
  scale_y_continuous(name="", limits=c(0, 1), labels = scales::percent) +
  scale_fill_manual(values = alpha(c("gray20","gray40", "gray60", "gray80", "black"), 0.85), name = "Marital status", labels = c("Never married", "Married", "Widowed", "Divorced", "Not stated")) +
  #facet_grid(gender~.) +
  labs(x ="") +
  ggtitle("85 years and older") + theme(plot.title = element_text(hjust = 0.5)) + 
  theme_light() 


fig_mosaic_mstat <- grid.arrange(mstat3049, mstat5064, mstat6584, mstat85plus, 
                                 legend_mstat, nrow = 3, ncol = 2,
                                 layout_matrix = rbind(c(1,2), c(3,4), c(5,5)),
                                 widths = c(5, 5), heights = c(3, 3, 0.5))

## Plotting the figure
fig_mosaic_mstat
