All computations done in R

# This script illustrates how to identify IRPs and NIRPs in survey data by
# reconstructing the steps taken by Sausen and Blasius for the PISA 2009
# principal survey data.
#
# Step 1: Download the data and syntax file from
#         https://www.oecd.org/pisa/pisaproducts/pisa2009database-downloadabledata.htm
#         ("School Questionnaire Data" and the corresponding SPSS syntax file;
#         alternatively, you can use the .txt file).
# Step 2: Execute the syntax file on the raw data in SPSS and save the result
#         as a "*.sav" file.
# Step 3: Specify the path to that file in this script via the variable "sav_file".
# Step 4: Specify the country subset you want the distance matrix for via the
#         variable "i".
# Step 5: Install the required libraries (if not already installed):
#         install.packages(c("foreign", "stringdist"))
# Step 6: Execute the script.

# load dataset ------------------------------------------------------------

library("foreign")
library("stringdist")

# Subset to country (the name must match one of the levels of the variable "CNT")
i = "Slovenia"

sav_file = "C:/Users/*****/Hamming/INT_SCQ09_DEC11.sav"
dataset <- read.spss(sav_file, use.value.labels = TRUE, to.data.frame = TRUE)

# Cleaning the Data --------------------------------------------------------

# remove unit nonrespondents: keep only cases with more than 20 % of values
# non-missing, i.e. drop cases where 80 % or more of the items are NA
dataset <- dataset[which(rowMeans(!is.na(dataset)) > 0.2), ]

# Select columns for transformation and control variables ------------------

# Select the questions you want to analyze via "ranges"; all selected
# questions are subsetted for the later concatenation into a single string.
#
# If you want to select the ranges by column names instead of column numbers,
# you may use the following function on an equivalent list of start-stop
# vectors of column names:
#
# ranges <- lapply(ranges_by_colname, function(x) match(x, colnames(dataset)))
ranges = list(
  c(6, 20),    # every question from "SC01Q01" to "SC02Q01" (15 variables in total)
  c(25, 26),   # questions "SC04Q01" and "SC05Q01"
  c(41, 225)   # every question from "SC11Q01" to "SC27Q01" (185 variables in total)
)

# In addition to the variables for the following concatenation we also need
# information that allows us to identify the cases in relation to the original
# dataset. For PISA these variables are the country and the school ID. To ease
# the transformation of the analyzed variables this information is stored
# separately and merged back afterwards.
control_vars <- dataset[c("CNT", "SCHOOLID")]

# subset the ranges into a dataframe
dataset <- as.data.frame(lapply(ranges, function(x) dataset[x[1]:x[2]]))

# create the string ---------------------------------------------------------

# Create a single column containing a string variable. This string consists of
# the concatenated list of items selected in the previous step.
# An easy way to get rid of factor levels and other forms of encoding is to
# convert the dataframe to a matrix of integers. This way, the labels are
# replaced by their underlying raw values.
#
# E.g. the levels of SC04Q01 are reduced from
# ["Village", "Small Town", "Town", "City", "Large City"]
# to
# [1, 2, 3, 4, 5]
dataset <- data.matrix(dataset)

# Recode NA to "X" (we need exactly ONE character per item/column; NAs would
# cause a lot of problems in the further analysis)
dataset[is.na(dataset)] <- "X"
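# --------------------------------------------------------------------------
# Minimal toy illustration (not part of the original script; the object names
# "toy" and "m" are made up) of what the two steps above do:
toy <- data.frame(SC04Q01 = factor(c("Village", "City", NA),
                                   levels = c("Village", "Small Town", "Town",
                                              "City", "Large City")))
m <- data.matrix(toy)  # factor labels are replaced by their integer codes: 1, 4, NA
m[is.na(m)] <- "X"     # assigning "X" coerces the matrix to character: "1", "4", "X"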
The "as.vector" is a wrapper for removing attributes, which can be # outsourced into a separe step as well by the following: # attributes(dataset) <- NULL concat_string <- as.vector(apply(dataset, 1, paste0, collapse = "")) # rejoin the string with the control variables dataset <- cbind(control_vars, concat_string) # compute the distance matrix --------------------------------------------- # subset dataset to country "i". This is just optional. However, # computing a distance matrix for a large sample might take a significant amount # of time since the number of computations is n(n-1)/2 dataset_nat <- dataset[dataset$CNT == i,] # subset the column of strings (again) into a new vector and associate each # concatenated string with the respective school-id. After computing the # distance matrix, this step allows us to easily identify the combination of # involved schools if a distance value is suspiciously small concat_string_nat <- dataset_nat$concat_string names(concat_string_nat) <- dataset_nat$SCHOOLID # compute distance matrix with the hamming method. This matrix is the basis # of all further analyses. By doing a "table(dist)" you can already # take a quick look on the frequency patterns dist <- stringdistmatrix(concat_string_nat, method = "hamming", useNames = "names") # generate the plots ------------------------------------------------------ # plotting the frequencies of the distance values for the subset (intentionally # not as histogram since outliers are better to identify if bin width = 1) plot(table(dist), main = paste("Absolute Freq of HDs for", i)) # generating the qq plot for the subset qqnorm(dist, main = paste("Normal Dist. Q-Q Plot for", i)) # adding a reference line based on the mean and std-dev. of the distance matrix abline(mean(dist), sd(dist), col="Red") # Identify minimal distance pairs # generate the plots ------------------------------------------------------ # plotting the frequencies of the distance values for the subset (intentionally # not as histogram since outliers are better to identify if bin width = 1) plot(table(dist), main = paste("Absolute Freq of HDs for", i)) # generating the qq plot for the subset qqnorm(dist, main = paste("Normal Dist. Q-Q Plot for", i)) # adding a reference line based on the mean and std-dev. of the distance matrix abline(mean(dist), sd(dist), col="Red") # Identify minimal distance pairs ----------------------------------------- # In this step it is demonstrated how to identify the most similar match for each # case. To do so, isolate the lowest value per column and look up the # resepective row-name which contains the school-id (see line 106). In a second # step, write these case-combinations and the corresponding HD value into a # new dataframe # transform the numeric vector of the "distance matrix" into an actual matrix # (This is necessary since R stores symmetric distance matrix as a special # vector with the class "dist". For more information see the documentation on # dist() ) dist.min <- as.matrix(dist) # removing zeroes in diagonale diag(dist.min) <- NA # write the name of each column into a vector (not strictly necessary since the # information is already contained in the column-names as well as the orginial # dataset and the names of concat_string_nat. 
# generate the plots ------------------------------------------------------

# plotting the frequencies of the distance values for the subset (intentionally
# not as a histogram, since outliers are easier to identify if the bin width is 1)
plot(table(dist), main = paste("Absolute Freq of HDs for", i))

# generating the Q-Q plot for the subset
qqnorm(dist, main = paste("Normal Dist. Q-Q Plot for", i))
# adding a reference line based on the mean and standard deviation of the
# distance matrix
abline(mean(dist), sd(dist), col = "red")

# Identify minimal distance pairs -----------------------------------------

# This step demonstrates how to identify the most similar match for each case.
# To do so, isolate the lowest value per column and look up the respective
# row name, which contains the school ID (assigned above as the names of
# concat_string_nat). In a second step, write these case combinations and the
# corresponding HD value into a new dataframe.

# Transform the "distance matrix" into an actual matrix. (This is necessary
# since R stores a symmetric distance matrix as a special vector of class
# "dist"; for more information see the documentation of dist().)
dist.min <- as.matrix(dist)

# remove the zeroes on the diagonal
diag(dist.min) <- NA

# Write the name of each column into a vector. (Not strictly necessary, since
# the information is already contained in the column names as well as in the
# original dataset and the names of concat_string_nat. However, this step
# makes the further procedure easier to follow.)
case_1 <- colnames(dist.min)

# identify the minimum HD value per column and push the actual value to a new vector
min_HD <- apply(dist.min, 2, min, na.rm = TRUE)

# Identify the *row-wise position* of the minimum HD value per column and push
# the corresponding name to a new vector. Remember that we pushed the school
# IDs into the dimnames of our matrix.
case_2 <- apply(dist.min, 2, function(x) names(which.min(x)))

# Finally, merge the vectors into a new dataframe. (data.frame() is used
# instead of as.data.frame(cbind(...)) so that min_HD stays numeric; otherwise
# the distance values would be sorted lexicographically in the plot below.)
df.dist.min <- data.frame(case_1, case_2, min_HD)

# plotting the frequencies of the *minimum* distance values for the subset
plot(table(df.dist.min$min_HD), main = paste("Absolute Freq of HD Min for", i))

___________________________________________________________________________________________________________________

Data preprocessed in SPSS, only Hamming distance computed in R

# This script imports the *preprocessed* SPSS data. It is applicable when the
# concatenation has already been done via the following SPSS syntax
# (in file ****):
#
#   count help=SC01Q01 to SC02Q01, SC04Q01, SC05Q01, SC11Q01 to SC27Q01 (MISSING).
#   fre help.
#   temporary.
#   select if (help le 39 and country='705').
#   WRITE OUTFILE='C:/****/pisa2009/writeout.txt'
#     /1 country (a9) schoolid (a10)
#        SC01Q01 to SC02Q01, SC04Q01, SC05Q01, SC11Q01 to SC27Q01 (202f1.0).
#   EXECUTE.

# This script requires the packages "readr" and "stringdist". Please ensure
# those packages are installed on your system, or install them via
# install.packages(c("readr", "stringdist"))
library(readr)
library(stringdist)

# load dataset: make sure to adjust the variable "sav_file" to match the path
# of your SPSS output file
sav_file = "C:/****/pisa2009/writeout.txt"
dataset_nat <- read_table2(sav_file, col_names = FALSE, col_types = "ccc")

# rename the columns to match the structure of dataset_nat in Hammingdistance.R
colnames(dataset_nat) <- c("CNT", "SCHOOLID", "concat_string")

# necessary for generating the plots (as in Hammingdistance.R)
i = "Slovenia"

# From here on, the procedure is the same as in the all-R script above.
concat_string_nat <- dataset_nat$concat_string
names(concat_string_nat) <- dataset_nat$SCHOOLID
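# --------------------------------------------------------------------------
# Optional sanity check (not part of the original scripts): the WRITE
# statement above emits exactly one character per item (202f1.0), so every
# imported string should be 202 characters long. A deviating length would
# make all Hamming distances involving that case Inf.
table(nchar(concat_string_nat))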
# Compute the distance matrix with the Hamming method. This matrix is the
# basis of all further analyses. With "table(dist)" you can already take a
# quick look at the frequency patterns.
dist <- stringdistmatrix(concat_string_nat, method = "hamming", useNames = "names")

# generate the plots ------------------------------------------------------

# plotting the frequencies of the distance values for the subset (intentionally
# not as a histogram, since outliers are easier to identify if the bin width is 1)
plot(table(dist), main = paste("Absolute Freq of HDs for", i))

# generating the Q-Q plot for the subset
qqnorm(dist, main = paste("Normal Dist. Q-Q Plot for", i))
# adding a reference line based on the mean and standard deviation of the
# distance matrix
abline(mean(dist), sd(dist), col = "red")

# Identify minimal distance pairs -----------------------------------------

# This step demonstrates how to identify the most similar match for each case.
# To do so, isolate the lowest value per column and look up the respective
# row name, which contains the school ID (assigned above as the names of
# concat_string_nat). In a second step, write these case combinations and the
# corresponding HD value into a new dataframe.

# Transform the "distance matrix" into an actual matrix. (This is necessary
# since R stores a symmetric distance matrix as a special vector of class
# "dist"; for more information see the documentation of dist().)
dist.min <- as.matrix(dist)

# remove the zeroes on the diagonal
diag(dist.min) <- NA

# Write the name of each column into a vector. (Not strictly necessary, since
# the information is already contained in the column names as well as in the
# original dataset and the names of concat_string_nat. However, this step
# makes the further procedure easier to follow.)
case_1 <- colnames(dist.min)

# identify the minimum HD value per column and push the actual value to a new vector
min_HD <- apply(dist.min, 2, min, na.rm = TRUE)

# Identify the *row-wise position* of the minimum HD value per column and push
# the corresponding name to a new vector. Remember that we pushed the school
# IDs into the dimnames of our matrix.
case_2 <- apply(dist.min, 2, function(x) names(which.min(x)))

# Finally, merge the vectors into a new dataframe (data.frame() keeps min_HD
# numeric; see the note in the first script).
df.dist.min <- data.frame(case_1, case_2, min_HD)

# plotting the frequencies of the *minimum* distance values for the subset
plot(table(df.dist.min$min_HD), main = paste("Absolute Freq of HD Min for", i))
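# --------------------------------------------------------------------------
# Illustrative follow-up (not part of the original scripts): sorting
# df.dist.min by min_HD lists the most similar school pairs first; pairs whose
# Hamming distance lies far below the bulk of the distribution are the
# candidates for (near-)identical response patterns.
head(df.dist.min[order(df.dist.min$min_HD), ], 10)
# write.csv(df.dist.min, "min_hamming_pairs.csv", row.names = FALSE)  # hypothetical file name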