# Note: file locations and pathways may be subject to change. 

##### Note on the samples ##### 
# We use two samples: 
# 1) NL 2020 HBS sample, referred to as 2020 web-based sample in thesis

# And the app-based samples (2021)
# 2) NL 

# We combine the 2021 app-based samples to make combined datasets

##### Note on quality indicators #####
# We calculate 6 quality indicators: entries; amount range; amount sd/sd amount; products range; products sd/sd products; store types 
# These can have different names in the Scripts/generated data files (both this script and Analyses.R): 
  # entries = entries
  # amount range = amountvar
  # amount sd/sd amount = sdamount 
  # products range = nprodvar
  # products sd/sd products = sdnprod
  # store types = ncat -> we use expected number of store types in the thesis, called catexpected in the script/generated data files

# The indicators are calculated for week 1 (days 1-7) and overall for all 4 samples, and additionally for week 2 for the 2021 samples 

##### Libraries #####
# Loading data: 
library(haven) # Load SPSS files
library(readxl) # Load Excel files

# Data manipulation: 
library(dplyr) # Easy data manipulation
library(data.table) # Easy data manipulation

###### Loading the data ###### 
## 2020 (web-based) ##
# Expenditure data: # 
nl_exp_2020 <- as.data.frame(read_sav("W:/HBS veldtest 2021/Evelien/HBS2021/Data files/P20.6.BO2020_HHBoekPosten_GeldigePop26-10-2021.sav")) # Expenditure data from households that completed the 2020 HBS 
nl_exp_2020 <- subset(nl_exp_2020, 
                      Vragenlijst == "AU") # Subset only expenses from AU week (only week all expenses were reported), because only these expenses will be used for data quality analyses

# Registry data: #
nl_pop_2020 <- as.data.frame(read_sav("W:/HBS veldtest 2021/Evelien/HBS2021/Data files/S4.BO2020_GeldigePopulatie_verrijking_2.sav")) # Registry data for households that completed the 2020 HBS
nl_pop_2020 <- nl_pop_2020[,c("UserID",      # Select only the variables from registry data that will be used in analyses: 
                              "HK2_LFT",     # Age
                              "AHL",         # Household size
                              "hk2_hla2",    # Origin
                              "WoningBezit", # Homeownership
                              "H10_ges",     # Income in standardized percentiles
                              "STEDGEM")]    # Urbanization of the municipality 


#  Weights: # 
# Load files required to calculate the correction weights: 
(load("W:/HBS veldtest 2021/Evelien/HBS2021/Data files/postenpu2.RData")) # Load inclusion weights 
inclusion_2020 <- postenpu[,c("UserID", "iwPU")] # Select UserID and inclusion weight 
rm(postenpu) # Remove from workspace
final_2020 <- as.data.frame(read.csv2("W:/HBS veldtest 2021/Evelien/HBS2021/Data files/gewichtenPU_15_12_csv2.csv")) # Load adjustment weights
correction_2020 <- merge(inclusion_2020, final_2020, by = "UserID") # Merge inclusion and adjustment weights in a file to be able to calculate correction weights 

# Calculate the correction weights: 
correction_2020$corw <- correction_2020$eindw/correction_2020$iwPU # Calculate correction weight by dividing adjustment weight by inclusion weight 
hist(correction_2020$corw) # Plot histogram of the correction weights to check the distribution. 
correction_2020$corw[correction_2020$corw > 3] <- 3 # Set correction weights larger than 3 equal to 3 (because weights larger than 3 are too large)
correction_2020$resp_prob <- (10420/78917)*correction_2020$corw # Calculate response probability by multiplying response rate and correction weight  
rm(final_2020) # Remove from workspace

write.csv(correction_2020, "W:/HBS veldtest 2021/Evelien/HBS2021/Data files/correction_2020.csv") # Save correction weights as csv file 

## 2021 ##
# Expenditure data: # 
nl_resp_2021 <- as.data.frame(read.csv("W:/HBS veldtest 2021/Evelien/HBS2021/Data files/NLrespanalysis.csv")) # Load the 2021 complete data (this file contains expenditures at the household level, response, registry, experimental conditions)
nl_transplus <- as.data.frame(read.csv("W:/HBS veldtest 2021/Evelien/HBS2021/Data files/NLtransplus.csv")) # Load 2021 detailed expenditure data (this file contains expenditures at the entry level)

# Registry data: # 
nl_pop_2021 <- as.data.frame(read_sav("W:/HBS veldtest 2021/Evelien/HBS2021/Data files/BOX21.sav")) # Load 2021 all registry data  
nl_pop_2021 <- nl_pop_2021[,c("Veilignummer",    # Select variables that are now missing from 2021 complete data: 
                              "VRLAANTALPERSHH", # Household size; 
                              "ETNGROEPKORT1",   # Origin; 
                              "STEDGEM")]        # Urbanization of the municipality 


nl_pop_2021 <- rename(nl_pop_2021, veilignummer = Veilignummer) # Rename veilignummer in population data to match variable name in the complete data file (nl_resp_2021)
nl_resp_2021 <- merge(nl_resp_2021, nl_pop_2021, by = "veilignummer") # Add extra registry variables to complete data

# Adding variables: #
nl_resp_2021$country <- "1. NL" # Create country variable

nl_resp_2021$dropout <- NA # Create dropout variable 
nl_resp_2021$dropout[nl_resp_2021$ndays < 15] <- 1 # Dropout is set to 1 if last activity seen before 14 days (dropout = yes)
nl_resp_2021$dropout[nl_resp_2021$ndays >= 15] <- 0 # No more dropout if last activity is seen at 14 days or later (right censored) (dropout = no)

nl_resp_2021$ndays_rounded <- NA # Add rounded ndays variable to use as moment of dropout
nl_resp_2021$ndays_rounded <- round(nl_resp_2021$ndays, digits = 0) # Round to full digit, rounding up from .5 

nl_resp_2021$studylength <- 14 # Add study length variable (= 14 days in NL)

nl_resp_2021$insampleF2F <- TRUE # This variable is not important for the NL data, but will need this variable when combining with Spain data

# Recode experimental conditions: # 
nl_resp_2021$insights[nl_resp_2021$insights == 2] <- 0 # Recode to match conditions to other countries. Now, immediate insights = 1 and delayed insights = 0 

###### Recode auxiliary data ###### 
## 2020 ## 
## Urbanization 
table(nl_pop_2020$STEDGEM) # Inspecting the levels of the variable. 
nl_pop_2020$urban <- nl_pop_2020$STEDGEM # Create new urbanization variable so the original one is still available after recoding
# Recoding: 
nl_pop_2020$urban[nl_pop_2020$urban == 5] <- 4 # Combine little & not to 1 group ( <1000 addresses/km^2)

## Age 
table(nl_pop_2020$HK2_LFT) # Inspecting the levels of the variable
nl_pop_2020$age <- NA # Create new age variable so that the original one is still available after recoding
# Recode to character instead of numeric: 
nl_pop_2020$age[nl_pop_2020$HK2_LFT == 1] <- "1. 18-24"
nl_pop_2020$age[nl_pop_2020$HK2_LFT == 2 | nl_pop_2020$HK2_LFT == 3] <- "2. 25-34"
nl_pop_2020$age[nl_pop_2020$HK2_LFT == 4 | nl_pop_2020$HK2_LFT == 5] <- "3. 35-44"
nl_pop_2020$age[nl_pop_2020$HK2_LFT == 6 | nl_pop_2020$HK2_LFT == 7] <- "4. 45-54"
nl_pop_2020$age[nl_pop_2020$HK2_LFT == 8 | nl_pop_2020$HK2_LFT == 9] <- "5. 55-64"
nl_pop_2020$age[nl_pop_2020$HK2_LFT == 10 | nl_pop_2020$HK2_LFT == 11] <- "6. 65-74"
nl_pop_2020$age[nl_pop_2020$HK2_LFT >= 12] <- "7. >=75"

## Income 
table(nl_pop_2020$H10_ges) # Inspecting the levels of the variable
nl_pop_2020$income <- NA # Create new income variable so that the original one is still available after recoding
# Recode to character instead of numeric:  
nl_pop_2020$income[nl_pop_2020$H10_ges == 1 | nl_pop_2020$H10_ges == 2] <- "1. 0-20% perc"
nl_pop_2020$income[nl_pop_2020$H10_ges == 3 | nl_pop_2020$H10_ges == 4] <- "2. 20-40% perc"
nl_pop_2020$income[nl_pop_2020$H10_ges == 5 | nl_pop_2020$H10_ges == 6] <- "3. 40-60% perc"
nl_pop_2020$income[nl_pop_2020$H10_ges == 7 | nl_pop_2020$H10_ges == 8] <- "4. 60-80% perc"
nl_pop_2020$income[nl_pop_2020$H10_ges == 9 | nl_pop_2020$H10_ges == 10] <- "5. 80-100% perc"

## Household size 
table(nl_pop_2020$AHL) # Inspecting the levels of the variable 
nl_pop_2020$hhsize <- NA # Create new householdsize variable so that the original one is still available after recoding 
# Recode to character instead of numeric:  
nl_pop_2020$hhsize[nl_pop_2020$AHL == 1] <- "1. 1 person"
nl_pop_2020$hhsize[nl_pop_2020$AHL == 2] <- "2. 2 persons"
nl_pop_2020$hhsize[nl_pop_2020$AHL == 3] <- "3. 3 persons"
nl_pop_2020$hhsize[nl_pop_2020$AHL >= 4] <- "4 or more persons"

## Homeownership 
table(nl_pop_2020$WoningBezit) #Inspecting the levels of the variable
nl_pop_2020$homeownership <- NA # Create new homeownership variable so that the original one is still available after recoding 
# Recode into binary character variable instead of numeric: 
nl_pop_2020$homeownership[nl_pop_2020$WoningBezit == 1] <- "Eigenaar woont in de woning" # Owner lives in house
nl_pop_2020$homeownership[nl_pop_2020$WoningBezit >= 2] <- "Huur" # Household is renting the house. Rent also encompasses living somewhere for free and 'other'

## Origin
table(nl_pop_2020$hk2_hla2) # Inspecting the levels of the variable 
nl_pop_2020$origin <- NA # Create new origin variable so that the original one is still available after recoding
# Recode into character instead of numeric: 
nl_pop_2020$origin[nl_pop_2020$hk2_hla2 == 1] <- "1. Dutch"
nl_pop_2020$origin[nl_pop_2020$hk2_hla2 == 3] <- "2. Non-western immigrant"
nl_pop_2020$origin[nl_pop_2020$hk2_hla2 == 2] <- "3. Western immigrant"


nl_pop_2020_w <- merge(nl_pop_2020, correction_2020, by = "UserID") # Merge registry data and correction weights

## 2021 ##
## Urbanization 
table(nl_resp_2021$STEDGEM) # Inspecting the levels of the variable. 
nl_resp_2021$urban <- nl_resp_2021$STEDGEM # Create new urbanization variable so the original one is still available after recoding 
nl_resp_2021$urban[nl_resp_2021$urban == 5] <- 4 # Combine little & not to 1 group ( <1000 addresses km/2)

## Age
table(nl_resp_2021$Leeftijd) # Inspecting the levels of the variable
nl_resp_2021$age <- NA # Create new age variable so that the original one is still available after recoding 
# Recode character levels to be less detailed: 
nl_resp_2021$age[nl_resp_2021$Leeftijd == "04. 18-24"] <- "1. 18-24" # Make sure this is reference category (because this is the lowest age class) 
nl_resp_2021$age[nl_resp_2021$Leeftijd == "05. 25-29" | nl_resp_2021$Leeftijd == "06. 30-34"] <- "2. 25-34"
nl_resp_2021$age[nl_resp_2021$Leeftijd == "07. 35-39" | nl_resp_2021$Leeftijd == "08. 40-44"] <- "3. 35-44"
nl_resp_2021$age[nl_resp_2021$Leeftijd == "09. 45-49" | nl_resp_2021$Leeftijd == "10. 50-54"] <- "4. 45-54"
nl_resp_2021$age[nl_resp_2021$Leeftijd == "11. 55-59" | nl_resp_2021$Leeftijd == "12. 60-64"] <- "5. 55-64"
nl_resp_2021$age[nl_resp_2021$Leeftijd == "13. 65-69" | nl_resp_2021$Leeftijd == "14. 70-74"] <- "6. 65-74"
nl_resp_2021$age[nl_resp_2021$Leeftijd == "15. >=75"] <- "7. >=75"

## Income 
table(nl_resp_2021$InkomenHH) #  # Inspecting the levels of the variable
nl_resp_2021$income <- nl_resp_2021$InkomenHH # Create new income variable so that the original one is still available after recoding
nl_resp_2021$income[nl_resp_2021$income == "0. Missing"] <- "6. Missing" # Change 0 to 6 so 'missing' will not be used as reference category (now 0-20 will be used as reference category) 

## Household size
table(nl_resp_2021$VRLAANTALPERSHH) #  # Inspecting the levels of the variable
nl_resp_2021 <- rename(nl_resp_2021, hhsize = VRLAANTALPERSHH) # Rename household size variable to match in all three countries
nl_resp_2021$hhsize[nl_resp_2021$hhsize > 4 ] <- 4 # Recode any households with 4 or more members to be in the same group 
nl_resp_2021$hhsize <- as.character(nl_resp_2021$hhsize) # Recode to character (because variable is now numeric)
# Rename levels of the variable: 
nl_resp_2021$hhsize[nl_resp_2021$hhsize== "1"] <- "1. 1 person"
nl_resp_2021$hhsize[nl_resp_2021$hhsize== "2"] <- "2. 2 persons"
nl_resp_2021$hhsize[nl_resp_2021$hhsize== "3"] <- "3. 3 persons"
nl_resp_2021$hhsize[nl_resp_2021$hhsize== "4"] <- "4. 4 or more persons"

## Homeownership 
table(nl_resp_2021$Huurkoop) # Inspecting the levels of the variable 
nl_resp_2021$homeowner <- nl_resp_2021$Huurkoop # Create new homeownership variable so that the original one is still available after recoding 
# Group all rent categories together in 1 new category: 
nl_resp_2021$homeowner[nl_resp_2021$homeowner == "Verhuurder is woningcorporatie" | 
                        nl_resp_2021$homeowner == "Onbekend" | # Homeownership = unknown can only be if household is renting the house (all owned houses are registered) 
                        nl_resp_2021$homeowner == "Verhuurder anders dan woningcorporatie"] <- "Huur" # Group all rent categories into 1 category 
nl_resp_2021$homeowner[is.na(nl_resp_2021$homeowner)] <- "Huur" # Recode missing as rent (because all owned houses are registered, and would therefore not be missing)

## Education level 
table(nl_resp_2021$OplNivHB) # Inspecting the levels of the variable
nl_resp_2021 <- rename(nl_resp_2021, edulev = OplNivHB) # Rename so variable name matches in all three countries 
# Recode edulev into fewer categories: 
nl_resp_2021$edulev[nl_resp_2021$edulev == "Basisonderwijs" | nl_resp_2021$edulev == "Vmbo, havo-, vwo-onderbouw, mbo 1"] <- "Basisonderwijs, vmbo, havo-, vwo onderbouw, mbo 1"
nl_resp_2021$edulev[nl_resp_2021$edulev == "Hbo-, wo-bachelor" | nl_resp_2021$edulev == "Hbo-, wo-master, doctor"] <- "Hbo-, wo bachelor/master, doctor"
nl_resp_2021$edulev[is.na(nl_resp_2021$edulev)] <- "Unknown" 

## Origin 
table(nl_resp_2021$ETNGROEPKORT1) # Inspecting the levels of the variable  
nl_resp_2021 <- rename(nl_resp_2021, origin = ETNGROEPKORT1) # Rename to have simpler variable name
nl_resp_2021$origin <- as.numeric(nl_resp_2021$origin) # Change to numeric to make recoding easier 
# Rename levels of the variable: 
nl_resp_2021$origin[nl_resp_2021$origin == 1] <- "1. Dutch" 
nl_resp_2021$origin[nl_resp_2021$origin == 3] <- "2. Non-western immigrant"
nl_resp_2021$origin[nl_resp_2021$origin == 4] <- "3. Western immigrant"
table(nl_resp_2021$origin) # Check if recoding was done correctly 

## Recode into factor
cnames <- c("age", "edulev", "origin", "hhsize", "urban", "homeowner", "income") # Create list with all variable names 
nl_resp_2021[,cnames] <- lapply(nl_resp_2021[,cnames], factor) # Recode all variables in list to factor

# Remove from workspace: 
rm(nl_pop_2021) 
rm(cnames)

###### Recode expenditure data ###### 
## 2020 ##
nl_resp_2020 <- as.data.frame(NA) # Create empty nl_resp dataframe for 2020. 

## Total amount per household
Total_amount_2020 <- nl_exp_2020[,c("UserID", "Price")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T) # Calculate total amount per household (independent of purchase) 
Total_amount_2020 <- rename(Total_amount_2020, amount = Price) # Rename price to amount  

nl_resp_2020 <- as.data.frame(Total_amount_2020) # Add total amount to nl_resp file
nl_resp_2020 <- merge(nl_resp_2020, correction_2020, by = "UserID") # Add weights to nl_resp_2020, only keeping households that have both purchases and weights
rm(Total_amount_2020) # Remove total amount dataframe because it is no longer needed  

# Standardize total amount: 
amount.w <- sum(nl_resp_2020$amount*nl_resp_2020$iwPU)/sum(nl_resp_2020$iwPU) # Calculate weighted average of total amount per household 
nl_resp_2020$amount_std_w <- nl_resp_2020$amount/amount.w # Standardize total amount using the weighted average
nl_resp_2020$amount_std <- nl_resp_2020$amount/mean(nl_resp_2020$amount) # Standardize amount using unweighted average

## Average amount per purchase per household 
# Calculate total amount per purchase (independent of household): 
Average_amount_2020_1 <- nl_exp_2020[,c("PurchaseID", "Price")] %>% 
  group_by(PurchaseID) %>% 
  summarise_all(.funs = sum, na.rm = T) # Calculate total amount per purchase (independent of household) 
ID_2020 <- nl_exp_2020[,c("UserID", "PurchaseID")] # Create file with IDs to use for merging
ID_2020 <- ID_2020[!duplicated(ID_2020$PurchaseID),] # Make sure each purchase ID only occurs once in ID file (to avoid duplicates later on)
Average_amount_2020_2 <- merge(x = Average_amount_2020_1,  y = ID_2020, by = "PurchaseID") # Add user IDs back in

# Standardize unweighted: 
avg.amount <- mean(Average_amount_2020_2$Price) # Calculate average amount per purchase for NL in 2020
Average_amount_2020_2$Price_std <- Average_amount_2020_2$Price/avg.amount # Standardize amount per purchase 

# Standardized weighted: 
Average_amount_2020_23 <- merge(Average_amount_2020_2, inclusion_2020, all.x = TRUE) # Add inclusion weights 
Average_amount_2020_23$weightedamount <- Average_amount_2020_23$Price * Average_amount_2020_23$iwPU # Calculate weighted total amount (independent of household)
Average_amount_2020_23 <- na.omit(Average_amount_2020_23) # Remove rows with missing amount or missing weight
avg.amount.w <- sum(Average_amount_2020_23$weightedamount)/sum(Average_amount_2020_23$iwPU) # Calculate weighted mean of amount per purchase 
Average_amount_2020_23$Price_std_w <- Average_amount_2020_23$Price/avg.amount.w # Standardize amount per purchase using the weighted mean 

# Calculate average amount per purchase per household 
Average_amount_2020_3 <- Average_amount_2020_2[,c("UserID", "Price_std")] %>% # Unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = mean, na.rm = T) # Calculate average standardized amount per purchase for each household
Average_amount_2020_3 <- rename(Average_amount_2020_3, amountavg_std = Price_std) # Rename price to amountavg 
nl_resp_2020 <- merge(nl_resp_2020, Average_amount_2020_3, by = "UserID") # Add to nl_resp file

Average_amount_2020_33 <- Average_amount_2020_23[,c("UserID", "Price_std_w")] %>% # Weighted
  group_by(UserID) %>% 
  summarise_all(.funs = mean, na.rm = T) # Calculate average standardized amount per purchase for each household
Average_amount_2020_33 <- rename(Average_amount_2020_33, amountavg_std_w = Price_std_w) # Rename price to amountavg 
nl_resp_2020 <- merge(nl_resp_2020, Average_amount_2020_33, by = "UserID") # Add to nl_resp file

## Total number of products per household
Aantal_producten_huishouden_2020 <- 
  nl_exp_2020[,c("PurchaseID", "UserID")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = length) # Calculate number of rows per household (will equal number of products because each row contains 1 product)
Aantal_producten_huishouden_2020 <- rename(Aantal_producten_huishouden_2020, nprod = PurchaseID) # Rename to nprod 

nl_resp_2020 <- merge(nl_resp_2020, Aantal_producten_huishouden_2020, by = "UserID") # Add to nl_resp file

rm(Aantal_producten_huishouden_2020) # Remove number of products file because it is no longer needed 

nprod.w <- sum(nl_resp_2020$nprod*nl_resp_2020$iwPU)/sum(nl_resp_2020$iwPU) # Calculate weighted average number of products per household
nl_resp_2020$nprod_std_w <- nl_resp_2020$nprod/nprod.w # Standardize number of products by dividing number of products by weighted average for each household
nl_resp_2020$nprod_std <- nl_resp_2020$nprod/mean(nl_resp_2020$nprod) # Standardzie number of products by dividing number of products by unweighted average for each household 

## Average number of products per purchase per household 
# Calculate number of products per purchase (independent of household) 
Average_products_2020_1 <- nl_exp_2020[,c("PurchaseID", "UserID")] %>% 
  group_by(PurchaseID) %>% 
  summarise_all(.funs = length)  # Calculate how many rows with that purchase ID exist, because that will equal the number of products 
Average_products_2020_1 <- rename(Average_products_2020_1, nprodavg = UserID) # Rename to nprodavg
Average_products_2020_2 <- merge(x = Average_products_2020_1,  y = ID_2020, by = "PurchaseID") # Add user ids back in

# Standardize unweighted: 
avg.nprod <- mean(Average_products_2020_2$nprodavg) # Calculate unweighted average number of products per purchase 
Average_products_2020_2$nprodavg_std <- Average_products_2020_2$nprodavg/avg.nprod # Standardize average number of products per purchase using the unweighted average 

# Standardized weighted: 
Average_products_2020_23 <- merge(Average_products_2020_2, inclusion_2020, all.x = TRUE) # Add inclusion weights
Average_products_2020_23$weightedprod <- Average_products_2020_23$nprodavg * Average_products_2020_23$iwPU # Calculate weighted total number of products per purchase
Average_products_2020_23 <- na.omit(Average_products_2020_23) # Remove any rows with missing purchases or missing weights 
avg.nprod.w <- sum(Average_products_2020_23$weightedprod)/sum(Average_products_2020_23$iwPU) # Calculate weighted average number of products per purchase
Average_products_2020_23$nprodavg_std_w <- Average_products_2020_23$nprodavg/avg.nprod.w # Standardize number of products per purchase using the weighted average

# Calculate average number of products per purchase per household: 
Average_products_2020_3 <- Average_products_2020_2[,c("UserID", "nprodavg_std")] %>%  # Unweighted  
  group_by(UserID) %>% 
  summarise_all(.funs = mean, na.rm = T) # Calculate average number of products per purchase per household

nl_resp_2020 <- merge(nl_resp_2020, Average_products_2020_3, by = "UserID") # Add to nl_resp file 

Average_products_2020_33 <- Average_products_2020_23[,c("UserID", "nprodavg_std_w")] %>% # Weighted 
  group_by(UserID) %>% 
  summarise_all(.funs = mean, na.rm = T) # Calculate average number of products per purchase per household  

nl_resp_2020 <- merge(nl_resp_2020, Average_products_2020_33, by = "UserID") # Add to nl_resp file 

## Min and max values 
# Minimum amount per purchase 
Min_amount_2020 <- Average_amount_2020_2[,c("UserID", "Price_std")] %>% # unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = min, na.rm = T)
Min_amount_2020 <- rename(Min_amount_2020, amountmin = Price_std) # rename 

Min_amount_2020_1 <- Average_amount_2020_23[,c("UserID", "Price_std_w")] %>% # unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = min, na.rm = T)
Min_amount_2020_1 <- rename(Min_amount_2020_1, amountmin_w = Price_std_w) # rename 

# Maximum amount per purchase 
Max_amount_2020 <- Average_amount_2020_2[,c("UserID", "Price_std")] %>% # unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = max, na.rm = T)
Max_amount_2020 <- rename(Max_amount_2020, amountmax = Price_std) # rename 

Max_amount_2020_1 <- Average_amount_2020_23[,c("UserID", "Price_std_w")] %>% # weighted
  group_by(UserID) %>% 
  summarise_all(.funs = max, na.rm = T)
Max_amount_2020_1 <- rename(Max_amount_2020_1, amountmax_w = Price_std_w) # rename 

# Minimum number of products per purchase 
Min_products_2020 <- Average_products_2020_2[,c("UserID", "nprodavg_std")] %>% # unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = min, na.rm = T)
Min_products_2020 <- rename(Min_products_2020, nprodmin = nprodavg_std) # rename 

Min_products_2020_1 <- Average_products_2020_23[,c("UserID", "nprodavg_std_w")] %>% # weighted 
  group_by(UserID) %>% 
  summarise_all(.funs = min, na.rm = T)
Min_products_2020_1 <- rename(Min_products_2020_1, nprodmin_w = nprodavg_std_w) # rename 

# Maximum number of products per purchase 
Max_products_2020 <- Average_products_2020_2[,c("UserID", "nprodavg_std")] %>% # unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = max, na.rm = T)
Max_products_2020 <- rename(Max_products_2020, nprodmax = nprodavg_std) 

Max_products_2020_1 <- Average_products_2020_23[,c("UserID", "nprodavg_std_w")] %>% # weighted 
  group_by(UserID) %>% 
  summarise_all(.funs = max, na.rm = T)
Max_products_2020_1 <- rename(Max_products_2020_1, nprodmax_w = nprodavg_std_w)

merge_fun1 <- function(x, y){ # write function for merging multiple files by UserID 
  df <- merge(x, y, by = "UserID") 
  return(df)
}

nl_resp_2020 <- Reduce(merge_fun1, list(nl_resp_2020, 
                                        Min_amount_2020, Max_amount_2020, Min_amount_2020_1, Max_amount_2020_1,
                                        Min_products_2020, Max_products_2020, Min_products_2020_1, Max_products_2020_1)) # merge min and max files to nl_resp file 

# Remove from workspace 
rm(Min_amount_2020)
rm(Min_products_2020)
rm(Max_amount_2020)
rm(Max_products_2020)
rm(Min_amount_2020_1)
rm(Min_products_2020_1)
rm(Max_amount_2020_1)
rm(Max_products_2020_1)

## Number of entries per household 
Aantal_entries_2020 <- as.data.frame(with(nl_exp_2020, aggregate(cbind(PurchaseID) ~ UserID, FUN = function(x){length(unique(x))}))) # count the number of unique purchase ids per household
Aantal_entries_2020 <- rename(Aantal_entries_2020, entries = PurchaseID) # rename 
nl_resp_2020 <- merge(nl_resp_2020, Aantal_entries_2020, by = "UserID") # merge with nl_resp file 

rm(Aantal_entries_2020) # remove from workspace 

## create difference variables (amount range and products range)
# unweighted 
nl_resp_2020$amountvar <- NA 
nl_resp_2020$amountvar <- (nl_resp_2020$amountmax - nl_resp_2020$amountmin) # amount 
nl_resp_2020$nprodvar <- NA 
nl_resp_2020$nprodvar <- (nl_resp_2020$nprodmax - nl_resp_2020$nprodmin) # products 

# weighted 
nl_resp_2020$amountvar_w <- NA 
nl_resp_2020$amountvar_w <- (nl_resp_2020$amountmax_w - nl_resp_2020$amountmin_w) # amount 
nl_resp_2020$nprodvar_w <- NA 
nl_resp_2020$nprodvar_w <- (nl_resp_2020$nprodmax_w - nl_resp_2020$nprodmin_w) # products 

# Households with only one entry will have no variability in data, so amount range and products range will be set to NA
nl_resp_2020$amountvar[nl_resp_2020$entries <= 1] <- NA 
nl_resp_2020$nprodvar[nl_resp_2020$entries <= 1] <- NA 
nl_resp_2020$amountvar_w[nl_resp_2020$entries <= 1] <- NA 
nl_resp_2020$nprodvar_w[nl_resp_2020$entries <= 1] <- NA 

## Calculate standard deviations over amount per purchase and nprod per purchase 
# Amount 
sd_amount <- Average_amount_2020_2[,c("UserID", "Price_std")] %>% # unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = sd, na.rm = T)
sd_amount <- rename(sd_amount, amountsd = Price_std)

nl_resp_2020 <- merge(nl_resp_2020, sd_amount, by = "UserID", all.x = TRUE) # merge

sd_amount_1 <- Average_amount_2020_23[,c("UserID", "Price_std_w")] %>% # unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = sd, na.rm = T)
sd_amount_1 <- rename(sd_amount_1, amountsd_w = Price_std_w)
nl_resp_2020 <- merge(nl_resp_2020, sd_amount_1, by = "UserID", all.x = TRUE) # merge 

# Nprod 
sd_nprod <- Average_products_2020_2[,c("UserID", "nprodavg_std")] %>% # unweighted 
  group_by(UserID) %>% 
  summarise_all(.funs = sd, na.rm = T) 
sd_nprod <- rename(sd_nprod, nprodsd = nprodavg_std)

nl_resp_2020 <- merge(nl_resp_2020, sd_nprod, by = "UserID", all.x = TRUE) # merge 

sd_nprod_1 <- Average_products_2020_23[,c("UserID", "nprodavg_std_w")] %>% # weighted 
  group_by(UserID) %>% 
  summarise_all(.funs = sd, na.rm = T) 
sd_nprod_1 <- rename(sd_nprod_1, nprodsd_w = nprodavg_std_w)

nl_resp_2020 <- merge(nl_resp_2020, sd_nprod_1, by = "UserID", all.x = TRUE) # merge 

nl_resp_2020$amountsd[nl_resp_2020$entries <= 1] <- NA # if a household has only 1 entry, there is never any variation in the data, so sd should be missing
nl_resp_2020$nprodsd[nl_resp_2020$entries <= 1] <- NA
nl_resp_2020$amountsd_w[nl_resp_2020$entries <= 1] <- NA 
nl_resp_2020$nprodsd_w[nl_resp_2020$entries <= 1] <- NA

# Remove from workspace 
rm(Average_amount_2020_1)
rm(Average_amount_2020_2)
rm(Average_amount_2020_23)
rm(Average_amount_2020_3)
rm(Average_amount_2020_33)
rm(Average_products_2020_1)
rm(Average_products_2020_2)
rm(Average_products_2020_23)
rm(Average_products_2020_3)
rm(Average_products_2020_33)
rm(sd_amount)
rm(sd_amount_1)
rm(sd_nprod)
rm(sd_nprod_1)
rm(amount.w)
rm(avg.amount)
rm(avg.amount.w)
rm(avg.nprod)
rm(avg.nprod.w)
rm(nprod.w)

## Number of categories in which a purchase was made 
# Existing sotorecodes need to be changed to simplified coding, so that only 11 categories remain
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 100] <- "A" # Petrol station
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 101 | nl_exp_2020$StoreCode == 102] <- "B" # Superstores
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 103] <- "C" # Supermarket
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 104 | nl_exp_2020$StoreCode == 105 | 
                       nl_exp_2020$StoreCode == 106 | nl_exp_2020$StoreCode == 107 |
                       nl_exp_2020$StoreCode == 108 | nl_exp_2020$StoreCode == 109 | 
                       nl_exp_2020$StoreCode == 110 |nl_exp_2020$StoreCode == 111 | 
                       nl_exp_2020$StoreCode == 171] <- "D" # Specialised food and drink stores 
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 113 | nl_exp_2020$StoreCode == 114] <- "E" # Clothing and shoes
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 130] <- "F" # Drug store/perfumery
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 115 | nl_exp_2020$StoreCode == 116 | 
                       nl_exp_2020$StoreCode == 119] <- "G" # Home interior
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 120 | nl_exp_2020$StoreCode == 122 | 
                       nl_exp_2020$StoreCode == 163 | nl_exp_2020$StoreCode == 163 |
                       nl_exp_2020$StoreCode == 175] <- "H" # Garden and house
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 176 | nl_exp_2020$StoreCode == 177 | 
                       nl_exp_2020$StoreCode == 178] <- "I" # Lunching and dining (horeca)
nl_exp_2020$StoreCat[nl_exp_2020$StoreCode == 127 | nl_exp_2020$StoreCode == 118 | 
                       nl_exp_2020$StoreCode == 125 | nl_exp_2020$StoreCode == 134 |
                       nl_exp_2020$StoreCode == 135] <- "J" # Media and books
nl_exp_2020$StoreCat[is.na(nl_exp_2020$StoreCat)] <- "K" # All other types

# Create dummies to use for counting number of categories 
nl_exp_2020$StoreCatA[nl_exp_2020$StoreCat == "A"] <- 1 
nl_exp_2020$StoreCatA[is.na(nl_exp_2020$StoreCatA)] <- 0 
nl_exp_2020$StoreCatB[nl_exp_2020$StoreCat == "B"] <- 1
nl_exp_2020$StoreCatB[is.na(nl_exp_2020$StoreCatB)] <- 0 
nl_exp_2020$StoreCatC[nl_exp_2020$StoreCat == "C"] <- 1
nl_exp_2020$StoreCatC[is.na(nl_exp_2020$StoreCatC)] <- 0
nl_exp_2020$StoreCatD[nl_exp_2020$StoreCat == "D"] <- 1
nl_exp_2020$StoreCatD[is.na(nl_exp_2020$StoreCatD)] <- 0
nl_exp_2020$StoreCatE[nl_exp_2020$StoreCat == "E"] <- 1
nl_exp_2020$StoreCatE[is.na(nl_exp_2020$StoreCatE)] <- 0
nl_exp_2020$StoreCatF[nl_exp_2020$StoreCat == "F"] <- 1
nl_exp_2020$StoreCatF[is.na(nl_exp_2020$StoreCatF)] <- 0
nl_exp_2020$StoreCatG[nl_exp_2020$StoreCat == "G"] <- 1
nl_exp_2020$StoreCatG[is.na(nl_exp_2020$StoreCatG)] <- 0
nl_exp_2020$StoreCatH[nl_exp_2020$StoreCat == "H"] <- 1
nl_exp_2020$StoreCatH[is.na(nl_exp_2020$StoreCatH)] <- 0
nl_exp_2020$StoreCatI[nl_exp_2020$StoreCat == "I"] <- 1
nl_exp_2020$StoreCatI[is.na(nl_exp_2020$StoreCatI)] <- 0
nl_exp_2020$StoreCatJ[nl_exp_2020$StoreCat == "J"] <- 1
nl_exp_2020$StoreCatJ[is.na(nl_exp_2020$StoreCatJ)] <- 0
nl_exp_2020$StoreCatK[nl_exp_2020$StoreCat == "K"] <- 1
nl_exp_2020$StoreCatK[is.na(nl_exp_2020$StoreCatK)] <- 0

# Check for each purchase id whether purchase was in that category
StoreCatA_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatA")] %>%  
  group_by(PurchaseID) 
StoreCatB_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatB")] %>% 
  group_by(PurchaseID) 
StoreCatC_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatC")] %>% 
  group_by(PurchaseID)
StoreCatD_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatD")] %>% 
  group_by(PurchaseID) 
StoreCatE_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatE")] %>% 
  group_by(PurchaseID) 
StoreCatF_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatF")] %>% 
  group_by(PurchaseID)
StoreCatG_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatG")] %>% 
  group_by(PurchaseID)
StoreCatH_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatH")] %>% 
  group_by(PurchaseID)
StoreCatI_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatI")] %>% 
  group_by(PurchaseID)
StoreCatJ_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatJ")] %>% 
  group_by(PurchaseID) 
StoreCatK_Tot <- nl_exp_2020[,c("PurchaseID", "StoreCatK")] %>% 
  group_by(PurchaseID) 

# remove duplicate purchase ids 
StoreCatA_Tot <- StoreCatA_Tot[!duplicated(StoreCatA_Tot$PurchaseID),] 
StoreCatB_Tot <- StoreCatB_Tot[!duplicated(StoreCatB_Tot$PurchaseID),]
StoreCatC_Tot <- StoreCatC_Tot[!duplicated(StoreCatC_Tot$PurchaseID),]
StoreCatD_Tot <- StoreCatD_Tot[!duplicated(StoreCatD_Tot$PurchaseID),]
StoreCatE_Tot <- StoreCatE_Tot[!duplicated(StoreCatE_Tot$PurchaseID),]
StoreCatF_Tot <- StoreCatF_Tot[!duplicated(StoreCatF_Tot$PurchaseID),]
StoreCatG_Tot <- StoreCatG_Tot[!duplicated(StoreCatG_Tot$PurchaseID),]
StoreCatH_Tot <- StoreCatH_Tot[!duplicated(StoreCatH_Tot$PurchaseID),]
StoreCatI_Tot <- StoreCatI_Tot[!duplicated(StoreCatI_Tot$PurchaseID),]
StoreCatJ_Tot <- StoreCatJ_Tot[!duplicated(StoreCatJ_Tot$PurchaseID),]
StoreCatK_Tot <- StoreCatK_Tot[!duplicated(StoreCatK_Tot$PurchaseID),]

merge_fun2 <- function(x, y){ # write function for merging multiple files by purchase id 
  df <- merge(x, y, by = "PurchaseID")
  return(df)
}

StoreCat_Merged <- Reduce(merge_fun2, list(StoreCatA_Tot, StoreCatB_Tot, StoreCatC_Tot, 
                                          StoreCatD_Tot, StoreCatE_Tot, StoreCatF_Tot, 
                                          StoreCatG_Tot, StoreCatH_Tot, StoreCatI_Tot, 
                                          StoreCatJ_Tot, StoreCatK_Tot)) # merge all storecat total files together



# Remove from workspace  
rm(StoreCatA_Tot)
rm(StoreCatB_Tot)
rm(StoreCatC_Tot)
rm(StoreCatD_Tot)
rm(StoreCatE_Tot)
rm(StoreCatF_Tot)
rm(StoreCatG_Tot)
rm(StoreCatH_Tot)
rm(StoreCatI_Tot)
rm(StoreCatJ_Tot)
rm(StoreCatK_Tot)

# Merge 
StoreCat_Merged <- merge(x = StoreCat_Merged,  y = ID_2020, by = "PurchaseID") # add user ids 

# Calculate for each household how many purchases were made in each category 
StoreCatA_Tot <- StoreCat_Merged[,c("UserID", "StoreCatA")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatB_Tot <- StoreCat_Merged[,c("UserID", "StoreCatB")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatC_Tot <- StoreCat_Merged[,c("UserID", "StoreCatC")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatD_Tot <- StoreCat_Merged[,c("UserID", "StoreCatD")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatE_Tot <- StoreCat_Merged[,c("UserID", "StoreCatE")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatF_Tot <- StoreCat_Merged[,c("UserID", "StoreCatF")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatG_Tot <- StoreCat_Merged[,c("UserID", "StoreCatG")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatH_Tot <- StoreCat_Merged[,c("UserID", "StoreCatH")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatI_Tot <- StoreCat_Merged[,c("UserID", "StoreCatI")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatJ_Tot <- StoreCat_Merged[,c("UserID", "StoreCatJ")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)
StoreCatK_Tot <- StoreCat_Merged[,c("UserID", "StoreCatK")] %>% 
  group_by(UserID) %>% 
  summarise_all(.funs = sum, na.rm = T)

StoreCat_Merged <- Reduce(merge_fun1, list(StoreCatA_Tot, StoreCatB_Tot, StoreCatC_Tot, 
                                          StoreCatD_Tot, StoreCatE_Tot, StoreCatF_Tot, 
                                          StoreCatG_Tot, StoreCatH_Tot, StoreCatI_Tot, 
                                          StoreCatJ_Tot, StoreCatK_Tot)) # merge together 

# Remove from workspace 
rm(StoreCatA_Tot)
rm(StoreCatB_Tot)
rm(StoreCatC_Tot)
rm(StoreCatD_Tot)
rm(StoreCatE_Tot)
rm(StoreCatF_Tot)
rm(StoreCatG_Tot)
rm(StoreCatH_Tot)
rm(StoreCatI_Tot)
rm(StoreCatJ_Tot)
rm(StoreCatK_Tot)
rm(merge_fun1)
rm(merge_fun2)

# Merge with nl_resp file
nl_resp_2020 <- merge(nl_resp_2020, StoreCat_Merged, by = "UserID") 

# Rename variables 
nl_resp_2020 <- rename(nl_resp_2020, catA = StoreCatA)  
nl_resp_2020 <- rename(nl_resp_2020, catB = StoreCatB)
nl_resp_2020 <- rename(nl_resp_2020, catC = StoreCatC)
nl_resp_2020 <- rename(nl_resp_2020, catD = StoreCatD)
nl_resp_2020 <- rename(nl_resp_2020, catE = StoreCatE)
nl_resp_2020 <- rename(nl_resp_2020, catF = StoreCatF)
nl_resp_2020 <- rename(nl_resp_2020, catG = StoreCatG) 
nl_resp_2020 <- rename(nl_resp_2020, catH = StoreCatH)
nl_resp_2020 <- rename(nl_resp_2020, catI = StoreCatI)
nl_resp_2020 <- rename(nl_resp_2020, catJ = StoreCatJ)
nl_resp_2020 <- rename(nl_resp_2020, catK = StoreCatK)

rm(StoreCat_Merged) # remove  from workspace 

# Make variable to count in how many categories purchases were reported
nl_resp_2020 <- nl_resp_2020 %>% 
  rowwise() %>%
  mutate(ncat = sum(c_across(catA:catK) > 0)) %>%
  ungroup()  

# Expected number categories per household 
nl_resp_2020$catexpected <- NA # create new variable 
nl_resp_2020$catexpected <- ( # calculate expected number of store categories 
    (1-((1-(nl_resp_2020$catA/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catB/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catC/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catD/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catE/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catF/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catG/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catH/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catI/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catJ/nl_resp_2020$entries))^nl_resp_2020$entries)) + 
    (1-((1-(nl_resp_2020$catK/nl_resp_2020$entries))^nl_resp_2020$entries))
)

# Remove  from workspace 
rm(ID_2020)
rm(nl_exp_2020)

## 2021 ## 
nl_weeks <- NA #create nl weeks file 

## Amount per purchase 
nl_transplus$amountw1 <- NA
nl_transplus$amountw1 <- nl_transplus$amount0_3 + nl_transplus$amount4_7 # days 0-7 are week 1
nl_transplus_w1 <- subset(nl_transplus, amountw1 > 0) # subset households with at least 1 purchase in week 1 
nl_transplus_w1$amountw1 <- nl_transplus_w1$amountw1/mean(nl_transplus_w1$amountw1, na.rm = TRUE) # standardize amount 
Min_amountw1 <- nl_transplus_w1[,c("user_id", "amountw1")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = min, na.rm = T) # calculate minimum amount per purchase 
Min_amountw1 <- rename(Min_amountw1, amountminw1 = amountw1) # rename 
Max_amountw1 <- nl_transplus_w1[,c("user_id", "amountw1")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = max, na.rm = T) # calculate maximum amount per purchase 
Max_amountw1 <- rename(Max_amountw1, amountmaxw1 = amountw1) # rename 
nl_weeks <- merge(Max_amountw1, Min_amountw1, by = "user_id", all = TRUE) # merge 

# Remove from workspace 
rm(Max_amountw1) 
rm(Min_amountw1)

nl_transplus$amountw2 <- NA # same proces for week 2 
nl_transplus$amountw2 <- nl_transplus$amount8_11 + nl_transplus$amount12_14 # days 8-14 are week 1 
nl_transplus_w2 <- subset(nl_transplus, amountw2 > 0) # same process for week 2, sub set households with at least 1 purchase in week 2 
nl_transplus_w2$amountw2 <- nl_transplus_w2$amountw2/mean(nl_transplus_w2$amountw2, na.rm = TRUE) # standardize amount 
Min_amountw2 <- nl_transplus_w2[,c("user_id", "amountw2")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = min, na.rm = T)
Min_amountw2 <- rename(Min_amountw2, amountminw2 = amountw2)
Max_amountw2 <- nl_transplus_w2[,c("user_id", "amountw2")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = max, na.rm = T)
Max_amountw2 <- rename(Max_amountw2, amountmaxw2 = amountw2)
nl_weeks <- merge(nl_weeks, Max_amountw2, by = "user_id", all = TRUE)
nl_weeks <- merge(nl_weeks, Min_amountw2, by = "user_id", all = TRUE)

# Remove from workspace 
rm(Max_amountw2)
rm(Min_amountw2)

## Sd over purchase amounts 
# Per week 
sd_amountw1 <- nl_transplus_w1[,c("user_id", "amountw1")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sd, na.rm = T) # calculate sd over all purchases within a household 
sd_amountw1 <- rename(sd_amountw1, sdamountw1 = amountw1) # rename 

sd_amountw2 <- nl_transplus_w2[,c("user_id", "amountw2")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sd, na.rm = T) # calculate sd over all purchases within a household 
sd_amountw2 <- rename(sd_amountw2, sdamountw2 = amountw2) # rename 

# Overall 
nl_transplus$amounttot <- NA # first need to calculate the total amount overall, start by creating empty variable 
nl_transplus$amounttot <- nl_transplus$amount0_3 + nl_transplus$amount4_7 + nl_transplus$amount8_11 + nl_transplus$amount12_14 + nl_transplus$amount15_18 + nl_transplus$amount19_inf # sum all entries per household 
nl_transplus$amounttot <- nl_transplus$amounttot/mean(nl_transplus$amounttot, na.rm = TRUE) # divide total amount by average amount for each household (to standardize)

amounttotmin <- NA 
amounttotmax <- NA 

amounttotmin <- nl_transplus[,c("user_id", "amounttot")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = min, na.rm = T)
amounttotmin <- rename(amounttotmin, amounttotmin = amounttot)
amounttotmax <- nl_transplus[,c("user_id", "amounttot")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = max, na.rm = T)
amounttotmax <- rename(amounttotmax, amounttotmax = amounttot)

sd_amounttot <- nl_transplus[,c("user_id", "amounttot")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sd, na.rm = T)
sd_amounttot <- rename(sd_amounttot, sdamounttot = amounttot)

nl_weeks <- merge(nl_weeks, sd_amountw1, by = "user_id", all = TRUE) # add standard deviations to nl weeks file 
nl_weeks <- merge(nl_weeks, sd_amountw2, by = "user_id", all = TRUE)
nl_weeks <- merge(nl_weeks, sd_amounttot, by = "user_id", all = TRUE)
nl_weeks <- merge(nl_weeks, amounttotmin, by = "user_id", all = TRUE)
nl_weeks <- merge(nl_weeks, amounttotmax, by = "user_id", all = TRUE)

# Remove from workspace 
rm(sd_amountw1) 
rm(sd_amountw2)
rm(sd_amounttot)
rm(amounttotmax)
rm(amounttotmin)

## Number of products per week  
nl_transplus_w1$nprodw1 <- NA # repeat same steps as amount for number of products in weeks 1 and 2 
nl_transplus_w1$nprodw1 <- nl_transplus_w1$nprod0_3 + nl_transplus_w1$nprod4_7
nl_transplus_w1$nprodw1 <- nl_transplus_w1$nprodw1/mean(nl_transplus_w1$nprodw1, na.rm = TRUE)
Min_nprodw1 <- nl_transplus_w1[,c("user_id", "nprodw1")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = min, na.rm = T)
Min_nprodw1 <- rename(Min_nprodw1, nprodminw1 = nprodw1)
nl_weeks <- merge(nl_weeks, Min_nprodw1, by = "user_id", all = TRUE)
rm(Min_nprodw1)

Max_nprodw1 <- nl_transplus_w1[,c("user_id", "nprodw1")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = max, na.rm = T)
Max_nprodw1 <- rename(Max_nprodw1, nprodmaxw1 = nprodw1)
nl_weeks <- merge(nl_weeks, Max_nprodw1, by = "user_id", all = TRUE)
rm(Max_nprodw1)

nl_transplus_w2$nprodw2 <- NA
nl_transplus_w2$nprodw2 <- nl_transplus_w2$nprod8_11 + nl_transplus_w2$nprod12_14
nl_transplus_w2$nprodw2 <- nl_transplus_w2$nprodw2/mean(nl_transplus_w2$nprodw2, na.rm = TRUE)
Min_nprodw2 <- nl_transplus_w2[,c("user_id", "nprodw2")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = min, na.rm = T)
Min_nprodw2 <- rename(Min_nprodw2, nprodminw2 = nprodw2)
nl_weeks <- merge(nl_weeks, Min_nprodw2, by = "user_id", all = TRUE)
rm(Min_nprodw2)

Max_nprodw2 <- nl_transplus_w2[,c("user_id", "nprodw2")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = max, na.rm = T)
Max_nprodw2 <- rename(Max_nprodw2, nprodmaxw2 = nprodw2)
nl_weeks <- merge(nl_weeks, Max_nprodw2, by = "user_id", all = TRUE)
rm(Max_nprodw2)

sd_nprodw1 <- nl_transplus_w1[,c("user_id", "nprodw1")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sd, na.rm = T)
sd_nprodw1 <- rename(sd_nprodw1, sdnprodw1 = nprodw1)

sd_nprodw2 <- nl_transplus_w2[,c("user_id", "nprodw2")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sd, na.rm = T)
sd_nprodw2 <- rename(sd_nprodw2, sdnprodw2 = nprodw2)

nl_transplus$nprodtot <- NA 
nl_transplus$nprodtot <- nl_transplus$nprod0_3 + nl_transplus$nprod4_7 + nl_transplus$nprod8_11 + nl_transplus$nprod12_14 + nl_transplus$nprod15_18 + nl_transplus$nprod19_inf
nl_transplus$nprodtot <- nl_transplus$nprodtot/mean(nl_transplus$nprodtot, na.rm = TRUE)

nprodtotmin <- NA 
nprodtotmax <- NA 

nprodtotmin <- nl_transplus[,c("user_id", "nprodtot")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = min, na.rm = T)
nprodtotmin <- rename(nprodtotmin, nprodtotmin = nprodtot)
nprodtotmax <- nl_transplus[,c("user_id", "nprodtot")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = max, na.rm = T)
nprodtotmax <- rename(nprodtotmax, nprodtotmax = nprodtot)

sd_nprodtot <- nl_transplus[,c("user_id", "nprodtot")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sd, na.rm = T)
sd_nprodtot <- rename(sd_nprodtot, sdnprodtot = nprodtot)

# Merge 
nl_weeks <- merge(nl_weeks, sd_nprodw1, by = "user_id", all = TRUE)
nl_weeks <- merge(nl_weeks, sd_nprodw2, by = "user_id", all = TRUE)
nl_weeks <- merge(nl_weeks, sd_nprodtot, by = "user_id", all = TRUE)
nl_weeks <- merge(nl_weeks, nprodtotmin, by = "user_id", all = TRUE)
nl_weeks <- merge(nl_weeks, nprodtotmax, by = "user_id")

# Remove from workspace 
rm(sd_nprodw1)
rm(sd_nprodw2)
rm(sd_nprodtot)
rm(nprodtotmin)
rm(nprodtotmax)

## Number of entries 
# Per week 
nl_transplus$entriesw1 <- NA
nl_transplus$entriesw1 <- nl_transplus$days0_3 + nl_transplus$days4_7
Entries_w1 <- nl_transplus[,c("user_id", "entriesw1")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T) # calculate number of entries in week 1 
nl_weeks <- merge(nl_weeks, Entries_w1, by = "user_id", all = TRUE) # merge
rm(Entries_w1) # remove from workspace 

nl_transplus$entriesw2 <- NA # same steps for week 2 
nl_transplus$entriesw2 <- nl_transplus$days8_11 + nl_transplus$days12_14
Entries_w2 <- nl_transplus[,c("user_id", "entriesw2")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
nl_weeks <- merge(nl_weeks, Entries_w2, by = "user_id", all = TRUE) # merge 
rm(Entries_w2) # remove from workspace 

# Overall 
nl_transplus$entries <- NA
nl_transplus$entries <- nl_transplus$days0_3 + nl_transplus$days4_7 + nl_transplus$days8_11 + nl_transplus$days12_14 + nl_transplus$days15_18 + nl_transplus$days19_inf
Entries <- nl_transplus[,c("user_id", "entries")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T) # calculate number of entries  
nl_weeks <- merge(nl_weeks, Entries, by = "user_id", all = TRUE) # merge 
rm(Entries) # remove from workspace 

nl_weeks$sdamounttot[nl_weeks$entries <= 1] <- NA # if a household has only 1 entry, there is never any variation in the data, so sd should be missing. 
nl_weeks$sdnprodtot[nl_weeks$entries <= 1] <- NA
nl_weeks$sdamountw1[nl_weeks$entriesw1 <= 1] <- NA 
nl_weeks$sdnprodw1[nl_weeks$entriesw1 <= 1] <- NA
nl_weeks$sdamountw2[nl_weeks$entriesw2 <= 1] <- NA  
nl_weeks$sdnprodw2[nl_weeks$entriesw2 <= 1] <- NA

## Categories 
merge_fun3 <- function(x, y){ # write function for merging multiple files by user id 
  df <- merge(x, y, by = "user_id", all = TRUE)
  return(df)
}

# count number of purchases in each category for week 1
w1 <- subset(nl_transplus, days0_3 == 1 | days4_7 == 1)
catAw1 <- w1[,c("user_id", "catA")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)  
catBw1 <- w1[,c("user_id", "catB")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catCw1 <- w1[,c("user_id", "catC")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catDw1 <- w1[,c("user_id", "catD")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catEw1 <- w1[,c("user_id", "catE")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catFw1 <- w1[,c("user_id", "catF")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catGw1 <- w1[,c("user_id", "catG")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catHw1 <- w1[,c("user_id", "catH")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catIw1 <- w1[,c("user_id", "catI")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catJw1 <- w1[,c("user_id", "catJ")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catKw1 <- w1[,c("user_id", "catK")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)

# Merge 
w1 <- Reduce(merge_fun3, list(catAw1, catBw1, catCw1, catDw1, catEw1, catFw1, catGw1, catHw1, catIw1, catJw1, catKw1))

# Rename 
oldnames <- c("catA", "catB", "catC", "catD", "catE", "catF", "catG", "catH", "catI", "catJ", "catK") # list with old names 
newnames <- c("catAw1", "catBw1", "catCw1", "catDw1", "catEw1", "catFw1", "catGw1", "catHw1", "catIw1", "catJw1", "catKw1") # list with new names 
for(i in 1:12) names(w1)[names(w1) == oldnames[i]] = newnames[i] # rename 
rm(oldnames) # remove from workspace 
rm(newnames) # remove from workspace 

# Count number of categories per household in week 1 
w1 <- w1 %>% 
  rowwise() %>%
  mutate(ncat = sum(c_across(catAw1:catKw1) > 0)) %>%
  ungroup() 

w1 <- rename(w1, ncatw1 = ncat) # rename 

# Repeat all the same steps for week 2 
w2 <- subset(nl_transplus, days8_11 == 1 | days12_14 == 1)
catAw2 <- w2[,c("user_id", "catA")] %>%  
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catBw2 <- w2[,c("user_id", "catB")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catCw2 <- w2[,c("user_id", "catC")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catDw2 <- w2[,c("user_id", "catD")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catEw2 <- w2[,c("user_id", "catE")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catFw2 <- w2[,c("user_id", "catF")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catGw2 <- w2[,c("user_id", "catG")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catHw2 <- w2[,c("user_id", "catH")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catIw2 <- w2[,c("user_id", "catI")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catJw2 <- w2[,c("user_id", "catJ")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
catKw2 <- w2[,c("user_id", "catK")] %>% 
  group_by(user_id) %>% 
  summarise_all(.funs = sum, na.rm = T)
w2 <- Reduce(merge_fun3, list(catAw2, catBw2, catCw2, catDw2, catEw2, catFw2, catGw2, catHw2, catIw2, catJw2, catKw2))
oldnames <- c("catA", "catB", "catC", "catD", "catE", "catF", "catG", "catH", "catI", "catJ", "catK")
newnames <- c("catAw2", "catBw2", "catCw2", "catDw2", "catEw2", "catFw2", "catGw2", "catHw2", "catIw2", "catJw2", "catKw2")
for(i in 1:12) names(w2)[names(w2) == oldnames[i]] = newnames[i]
rm(oldnames)
rm(newnames)
w2 <- w2 %>% 
  rowwise() %>%
  mutate(ncat = sum(c_across(catAw2:catKw2) > 0)) %>%
  ungroup() 

w2 <- rename(w2, ncatw2 = ncat)

userid <- nl_weeks[,1, drop = FALSE] # to use for merging 

nl_weeks <- Reduce(merge_fun3, list(userid, nl_weeks, 
                                    w1, w2))

# Remove from workspace 
rm(userid)
rm(catAw1)
rm(catBw1)
rm(catCw1)
rm(catDw1)
rm(catEw1)
rm(catFw1)
rm(catGw1)
rm(catHw1)
rm(catIw1)
rm(catJw1)
rm(catKw1)
rm(catAw2)
rm(catBw2)
rm(catCw2)
rm(catDw2)
rm(catEw2)
rm(catFw2)
rm(catGw2)
rm(catHw2)
rm(catIw2)
rm(catJw2)
rm(catKw2)
rm(w1)
rm(w2)
rm(merge_fun3)

# Expected number of categories 
nl_weeks$catexpectedw1 <- NA # week 1
nl_weeks$catexpectedw1 <- (
  (1-((1-(nl_weeks$catAw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catBw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catCw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catDw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catEw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catFw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catGw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catHw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catIw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catJw1/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catKw1/nl_weeks$entries))^nl_weeks$entries))
)

nl_weeks$catexpectedw2 <- NA # week 2
nl_weeks$catexpectedw2 <- (
  (1-((1-(nl_weeks$catAw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catBw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catCw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catDw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catEw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catFw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catGw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catHw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catIw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catJw2/nl_weeks$entries))^nl_weeks$entries)) + 
    (1-((1-(nl_weeks$catKw2/nl_weeks$entries))^nl_weeks$entries))
)

nl_weeks <- rename(nl_weeks, dbuser = user_id) # rename user id so merging is possible 
nl_resp_2021 <- merge(nl_resp_2021, nl_weeks, by = "dbuser", all = TRUE) # merge 
rm(nl_weeks) # remove from workspace 

nl_resp_2021 <- nl_resp_2021 %>% 
  rowwise() %>%
  mutate(ncat = sum(c_across(catA:catK) > 0)) %>%
  ungroup() # number of categories (store types) 

nl_resp_2021$catexpected <- NA # overall 
nl_resp_2021$catexpected <- ( # expected number of categories (store types) 
  (1-((1-(nl_resp_2021$catA/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catB/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catC/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catD/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catE/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catF/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catG/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catH/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catI/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catJ/nl_resp_2021$entries))^nl_resp_2021$entries)) + 
    (1-((1-(nl_resp_2021$catK/nl_resp_2021$entries))^nl_resp_2021$entries))
)


## Difference variables (amount range and products range) 
nl_resp_2021$amountvarw1 <- NA
nl_resp_2021$amountvarw1 <- (nl_resp_2021$amountmaxw1 - nl_resp_2021$amountminw1) # amount range week 1

nl_resp_2021$amountvarw2 <- NA
nl_resp_2021$amountvarw2 <- (nl_resp_2021$amountmaxw2 - nl_resp_2021$amountminw2) # amount range week 2

nl_resp_2021$amountvar <- NA
nl_resp_2021$amountvar <- (nl_resp_2021$amounttotmax - nl_resp_2021$amounttotmin) # amount range overall 

nl_resp_2021$nprodvarw1 <- NA
nl_resp_2021$nprodvarw1 <- (nl_resp_2021$nprodmaxw1 - nl_resp_2021$nprodminw1) # products range week 1

nl_resp_2021$nprodvarw2 <- NA
nl_resp_2021$nprodvarw2 <- (nl_resp_2021$nprodmaxw2 - nl_resp_2021$nprodminw2) # products range week 2 

nl_resp_2021$nprodvar <- NA
nl_resp_2021$nprodvar <- (nl_resp_2021$nprodtotmax - nl_resp_2021$nprodtotmin) # products range overall 

# Households with only one entry will have no spread in data, thus var variables will be set to NA for those households
nl_resp_2021$amountvar[nl_resp_2021$entries <= 1] <- NA 
nl_resp_2021$nprodvar[nl_resp_2021$entries <= 1] <- NA
nl_resp_2021$amountvarw1[nl_resp_2021$entriesw1 <= 1] <- NA 
nl_resp_2021$nprodvarw1[nl_resp_2021$entriesw1 <= 1] <- NA
nl_resp_2021$amountvarw2[nl_resp_2021$entriesw2 <= 1] <- NA 
nl_resp_2021$nprodvarw2[nl_resp_2021$entriesw2 <= 1] <- NA

# Remove from workspace 
rm(nl_transplus)
rm(nl_transplus_w1)
rm(nl_transplus_w2)

###### Merge ######
# Merge 2020 expenditure data with 2020 registry data 
nl_resp_2020 <- merge(nl_resp_2020, nl_pop_2020, by = "UserID", all.x = TRUE)
rm(nl_pop_2020) # remove from workspace 

###### Subset correct households ######
## 2021 
nl_resp_2021 <- subset(nl_resp_2021, insample == TRUE) # subset households that belong in sample
nl_resp_2021 <- nl_resp_2021[!(is.na(nl_resp_2021$Leeftijd)),] # remove households with missing registry data 
nl_registration_2021 <- subset(nl_resp_2021, registered == TRUE) # subset responding households
nl_active_2021 <- subset(nl_resp_2021, active == TRUE) # subset participating households 
nl_complete_2021 <- subset(nl_resp_2021, complete == TRUE) # subset complete households 



