SRM_replication_code

Packages, Functions, and Data

Packages

library(haven)
library(broom)
library(knitr)
library(DT)
library(psych)
library(sjstats)
library(pwr)
library(equate)
library(tidyverse)
library(furrr)
library(viridis)
library(Cairo)
library(ggrepel)
library(kableExtra)

Custom Functions

Substantive Functions

# -- MOC Interpolation -----------------------------------
# See section "Interval Quantity Questions" for the intervals_df structure

moc_interpolation <- function(intervals_df, alpha){
  intervals_df %>% 
    mutate(
      moc = case_when(
        upper_bound != Inf ~ (lower_bound + upper_bound)/2,
        upper_bound == Inf ~ lower_bound*(alpha/(alpha-1))
      )
    ) %>% select(moc) %>% 
    rename_with(~ paste0("alpha",alpha))
}


# -- OSE-RG ---------------------------------------------
# Wrapper function around the equate() function from the equate package

calculate_eq_equivalents <- function(freqtab_x, freqtab_y){
  eq <- equate(freqtab_x, freqtab_y, type = "equipercentile")
  eq$concordance$yx
}

# -- Open-Ended Question Aggregation --------------------

# The function takes the vector of open-ended responses, selects values within a quantity interval, and calculates the mean quantity
# Optionally, it truncates the open-ended responses first, be removing the top x values

aggregate_open_interval <- function(lower_bound, upper_bound, v, trim_upper = 0){
  v<- v[!is.na(v)]
  v <- sort(v) 
  v <- v[1:floor(length(v)*(1-trim_upper))]
  v[v>=lower_bound & v <= upper_bound] %>% mean()
}

aggregate_open_interval_generalized <- function(lower_bound, upper_bound, v, f){
  v<- v[!is.na(v)]
  v[v>=lower_bound & v <= upper_bound] %>% f()
}


# -- Assymetric Trimmed Mean -------------------------------------
# Only removes upper cases 
# in contrast to the trim argument of base::mean()

mean_trim_upper <- function(x,trim_upper = 0.05) {
  x <- x[!is.na(x)]
  x <- sort(x) 
  mean(x[1:floor(length(x)*(1-trim_upper))])
}

Utility Functions

# Table styling shortcut

style_table <- function(df){
  df %>% 
  kable() %>% kable_styling(full_width = FALSE, position = "left") }

# Extracts recoding information from the harmonized_equivalents_df
get_rec_vector <- function(condition, approach, rec_df = harmonized_equivalents_df){
  rec_df %>% 
    filter(.data$condition == .env$condition) %>% 
    pull({{approach}})
}

Load data

source("SRM_replication_code_sub_population_analysis.R")
Analysis for sub population:  Full sampleAnalysis for sub population:  FemaleAnalysis for sub population:  MaleAnalysis for sub population:  30 to 59Analysis for sub population:  60 and olderAnalysis for sub population:  Under 30Analysis for sub population:  (Fach-)HochschulreifeAnalysis for sub population:  no (Fach-)HochschulreifeAnalysis for sub population:  WestAnalysis for sub population:  East
full_pop_results <- sub_pop_format %>% filter(sub_pop == "Full sample")
sub_pop_results <- sub_pop_format %>% filter(sub_pop != "Full sample")

data<- read_sav("SRM_replication_data.sav")

# Dataframe only with valid answers to the quantity question:
valid_df <- data %>% 
  mutate(any_valid = coalesce(books_low_freq, books_med_freq, books_hi_freq, books_numeric)) %>% 
  drop_na(any_valid)

Descriptives

Sample size

total_cases <- data %>% nrow()
valid_cases <- valid_df %>% nrow()

The dataset contains 3497 cases in total. Of those, 3484 cases have valid answers to the quantity question.

Demography

Sex

valid_df %>% 
  group_by(sex = D01 %>% as_factor) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  mutate(percent = 100*n/sum(n),
         percent = round(percent, 2)) %>% style_table()
sex n percent
männlich 1760 50.52
weiblich 1721 49.40
divers 3 0.09

Age

valid_df %>% 
  mutate(age = 2020-D02) %>% 
  summarise(across(age, list(mean=mean, median=median, sd=sd,min=min,max=max))) %>% 
  t %>% style_table
age_mean 45.13749
age_median 46.00000
age_sd 14.98322
age_min 18.00000
age_max 87.00000

Education

valid_df %>% 
  group_by(education %>% as_factor) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  mutate(percent = 100*n/sum(n),
         percent = round(percent, 2)) %>% 
  style_table()
education %>% as_factor n percent
Schüler/-in 17 0.49
Von der Schule abgegangen ohne Abschluss 8 0.23
Polytechnische Oberschule DDR, Abschluss 8. oder 9. Klasse 34 0.98
Polytechnische Oberschule DDR, Abschluss 10. Klasse 229 6.57
Hauptschulabschluss, Volksschulabschluss 343 9.85
Realschulabschluss, Mittlere Reife 969 27.81
Fachhochschulreife 384 11.02
Abitur, allgemeine oder fachgebundene Hochschulreife 1500 43.05

Preliminary analyses and instrument information

Open ended quantity question percentiles

Full sample result

full_pop_results %>% select(quantiles_table) %>%
    pull() %>%
    .[[1]] %>%
    as_tibble() %>%
    style_table() %>% print
1% 5% 25% 50% 75% 95% 99%
0 2 30 80 200 1000 2570

Subsample results

1% 5% 25% 50% 75% 95% 99%
0 4.1 40 100 200 1000 2438
1% 5% 25% 50% 75% 95% 99%
0 0 20 56 200 800 2828
1% 5% 25% 50% 75% 95% 99%
0 2 30 100 200 1000 2000
1% 5% 25% 50% 75% 95% 99%
0 3.95 50 105 300 1305 4084
1% 5% 25% 50% 75% 95% 99%
0 1.35 20 40 100 500 1933
1% 5% 25% 50% 75% 95% 99%
0 5 43.75 100 283.5 1025 4000
1% 5% 25% 50% 75% 95% 99%
0 0 20 60 150 600 2000
1% 5% 25% 50% 75% 95% 99%
0 1.35 30 99 200 1000 2565
1% 5% 25% 50% 75% 95% 99%
0 2 30 60 200 1000 2036

Interval quantity questions

Full sample result

full_pop_results %>%
    select(histogram_question) %>%
    pull() %>%
    .[[1]] %>%
    print

Subsample results

Interval Quantity Question Boundaries

Table

Note that the data structure intervals_df also severs as an input into the moc_interpolation() function. This is because the MOC interpolation requires information about the verbatim interval boundaries from the different quantity measurement instruments.

intervals_df <- tribble(
  ~condition, ~lower_bound, ~upper_bound,
  "low quantity", 0, 10,
  "low quantity", 11, 25,
  "low quantity", 26, 50,
  "low quantity", 51, Inf,
  "medium quantity", 0, 25,
  "medium quantity", 26, 50,
  "medium quantity", 51, 100,
  "medium quantity", 101, Inf,
  "high quantity", 0, 50,
  "high quantity", 51, 100,
  "high quantity", 101, 250,
  "high quantity", 251, Inf
) %>% 
  mutate(
    condition = factor(condition, levels = c("low quantity", "medium quantity", "high quantity")) %>% fct_rev
  )

intervals_df %>% style_table()
condition lower_bound upper_bound
low quantity 0 10
low quantity 11 25
low quantity 26 50
low quantity 51 Inf
medium quantity 0 25
medium quantity 26 50
medium quantity 51 100
medium quantity 101 Inf
high quantity 0 50
high quantity 51 100
high quantity 101 250
high quantity 251 Inf

Plot

intervals_df %>% 
  group_by(condition) %>% 
  mutate(row = row_number(),
         condition_num = condition %>% as.numeric,
    upper_bound_label = ifelse(upper_bound != Inf, upper_bound, NA))%>% 
  ggplot()+
  annotate(geom = "segment",
           x = 50, xend = 50, y = 0.65, yend = 3.45,
           size = 14, color = "#aaaaaa", lineend = "round")+
  annotate(geom = "segment",
           x = 50, xend = 50, y = 0.65, yend = 3.45,
           size = 12, color = "#eeeeee", lineend = "round")+
  geom_rect(aes(ymin=condition_num-0.3, ymax=condition_num+0.3, 
                xmin=lower_bound, xmax = upper_bound,
                fill=row))+
  geom_text(aes(0, condition_num, label = condition), hjust = 1.2, size = 4)+
  geom_text(aes(upper_bound_label, condition_num+0.4, label = upper_bound_label), size = 4)+
  geom_text(aes(310, condition_num+0.41), label = expression(phantom(0)%->% ~~infinity), size = 4)+
  scale_x_continuous(limits = c(-100, 310))+
  scale_fill_viridis(option = "mako", end = 0.7, begin = 0.3)+
  theme_void()+
  theme(
    legend.position = "none",
    plot.background = element_rect(color = "white", fill="white")
  )

Response bias demonstration

Descriptive

Note how the share of respondents reporting to own 50 books or less depends on the way the interval question.

Full sample result

valid_df %>% 
  mutate(
    low_50_or_less = ifelse(books_low_freq <=3, TRUE, FALSE),
    med_50_or_less = ifelse(books_med_freq <=2, TRUE, FALSE),
    hi_50_or_less = ifelse(books_hi_freq <=1, TRUE, FALSE)
  ) %>% 
  select(contains("_50_or_less")) %>% 
  pivot_longer(everything(), names_to = "condition", values_to = "50_or_less") %>% 
  drop_na(`50_or_less`) %>% 
  group_by(condition) %>% 
  summarise(Percent_50_or_less = mean(`50_or_less`)*100) %>% 
  mutate(condition = factor(condition, 
                            levels = c("low_50_or_less", 
                                       "med_50_or_less", 
                                       "hi_50_or_less"),
                            labels = c("low quantity",
                                       "medium quantity",
                                       "high quantity"),
                            ordered = TRUE)
         ) %>% arrange(condition) %>% style_table()
condition Percent_50_or_less
low quantity 44.82759
medium quantity 40.89888
high quantity 33.29480

Subsample results

condition Percent_50_or_less
low quantity 39.47991
medium quantity 33.25740
high quantity 29.27400
condition Percent_50_or_less
low quantity 50.23923
medium quantity 48.10690
high quantity 37.21461
condition Percent_50_or_less
low quantity 44.97154
medium quantity 39.43396
high quantity 33.75681
condition Percent_50_or_less
low quantity 27.45098
medium quantity 32.35294
high quantity 25.67568
condition Percent_50_or_less
low quantity 60.86957
medium quantity 52.12766
high quantity 38.55422
condition Percent_50_or_less
low quantity 35.45706
medium quantity 30.46875
high quantity 22.16359
condition Percent_50_or_less
low quantity 51.87500
medium quantity 48.61111
high quantity 41.97531
condition Percent_50_or_less
low quantity 46.26168
medium quantity 39.85401
high quantity 31.71806
condition Percent_50_or_less
low quantity 40.20101
medium quantity 43.84236
high quantity 39.13043

Spearman Test

Full sample result

spearman_df<- valid_df %>% 
  mutate(
    low_50_or_less = ifelse(books_low_freq <=3, TRUE, FALSE),
    med_50_or_less = ifelse(books_low_freq <=2, TRUE, FALSE),
    hi_50_or_less = ifelse(books_low_freq <=1, TRUE, FALSE)
  ) %>% 
  select(contains("_50_or_less")) %>% 
  pivot_longer(everything(), names_to = "condition", values_to = "50_or_less") %>% 
  drop_na(`50_or_less`) %>% 
  mutate(more_than_50 = as.numeric(!`50_or_less`))%>% 
  mutate(condition = factor(condition, levels = c("low_50_or_less", 
                                                  "med_50_or_less", 
                                                  "hi_50_or_less"),
                            ordered = TRUE),
         condition_num = as.numeric(condition))

cor.test(spearman_df$condition_num, spearman_df$more_than_50, method = "spearman") %>% tidy %>% 
  mutate(across(where(is.numeric), round, 3)) %>% t %>% style_table()
estimate 0.315
statistic 1832697214
p.value 0
method Spearman's rank correlation rho
alternative two.sided

Subsample results

estimate 0.306
statistic 236342918
p.value 0
method Spearman’s rank correlation rho
alternative two.sided
estimate 0.327
statistic 221252364
p.value 0
method Spearman’s rank correlation rho
alternative two.sided
estimate 0.301
statistic 460699775
p.value 0
method Spearman’s rank correlation rho
alternative two.sided
estimate 0.227
statistic 12451207
p.value 0
method Spearman’s rank correlation rho
alternative two.sided
estimate 0.443
statistic 10457672
p.value 0
method Spearman’s rank correlation rho
alternative two.sided
estimate 0.333
statistic 141231856
p.value 0
method Spearman’s rank correlation rho
alternative two.sided
estimate 0.313
statistic 341665745
p.value 0
method Spearman’s rank correlation rho
alternative two.sided
estimate 0.318
statistic 811625604
p.value 0
method Spearman’s rank correlation rho
alternative two.sided
estimate 0.307
statistic 24590562
p.value 0
method Spearman’s rank correlation rho
alternative two.sided

Sensitivity of MOC to different Alpha values

intervals_df %>% 
  add_column(
    map_dfc(seq(1,3,length.out = 21), ~moc_interpolation(intervals_df, .x))
  ) %>% 
  pivot_longer(starts_with("alpha"), names_to = "alpha", values_to = "moc_equivalents") %>% 
  mutate(alpha = str_remove(alpha, "alpha") %>% as.numeric()) %>% 
  filter(upper_bound==Inf,
         alpha != 1) %>% 
  mutate(response_label = paste0("More than ", lower_bound)) %>% 
  ggplot(aes(x=alpha, y=moc_equivalents))+
  facet_grid(rows = vars(condition), scales = "free")+
  geom_point(aes(color=alpha), size =4)+
  geom_text(aes(color=alpha, label = round(moc_equivalents, 0)), angle = 45, hjust = -0.2, size = 6)+
  geom_text(aes(x=Inf, y=Inf, label = response_label), 
            stat = "unique", hjust = 1.05, vjust = 1.3, size = 6)+
  scale_y_continuous("MOC interp. of highest interval", limits = c(0, 3600))+
  scale_x_continuous(breaks = c(1.1, 1.5, 2.0, 2.5, 3.0))+
  theme_bw(base_size = 14)+
  theme(
    legend.position = "none",
    plot.background = element_rect(colour = "white", fill="white")
  )+coord_cartesian(clip = FALSE)

Main Analyses

All interval conditions harmonized towards the open ended format

# Preparing Frequency tables for OSE-RG

freqtab_low <- freqtab(data$books_low_freq)
freqtab_med <- freqtab(data$books_med_freq)
freqtab_hi <- freqtab(data$books_hi_freq)
freqtab_open <- freqtab(data$books_numeric)

# Extracting the vector of open responses
open_vector <- data$books_numeric

# Calculate a table with equivalent values for every interval response option

harmonized_equivalents_df <- intervals_df %>%
  # Calculating MOC interpolated values
  add_column(
    moc_interpolation(intervals_df, alpha = 2)
  ) %>% 
  # Calculating OSE-RG Equivalents
  mutate(
    equip_equating = c(
      calculate_eq_equivalents(freqtab_low, freqtab_open),
      calculate_eq_equivalents(freqtab_med, freqtab_open),
      calculate_eq_equivalents(freqtab_hi, freqtab_open)
    )) %>% 
  # Calculate average open-ended values within each interval
  mutate(
    # All open-ended responses
    open_mean_quantity = map2_dbl(lower_bound, upper_bound,
                                  ~aggregate_open_interval(.x, .y, open_vector)),
    # Trimmed by removing the top 5% open-ended responses
    open_mean_quantity_trimmed = map2_dbl(lower_bound, upper_bound,
                                          ~aggregate_open_interval(.x, .y, open_vector, trim_upper = 0.05)),
  open_median_quantity = map2_dbl(lower_bound, upper_bound, 
                                ~aggregate_open_interval_generalized(.x, .y, open_vector, median))
  ) %>% 
  # Adding numerical interval IDs and interval labels
  group_by(condition) %>% 
  mutate(
    interval_number = row_number(),
    interval_label = paste0("[",lower_bound, ", ",upper_bound,"]")
    )

Plot equivalent values Interval -> Open Ended

Full sample result

harmonized_equivalents_df %>% 
  mutate(interval_number = (-1*(interval_number-1))+4,
    interval_label = paste0(interval_number, " - ", interval_label),
         interval_label = factor(interval_label) %>% fct_rev) %>% 
  pivot_longer(alpha2:open_median_quantity, values_to = "equivalents", names_to = "approach") %>% 
  mutate(
    condition = fct_rev(condition),
    approach = factor(approach,
                      levels = c(
                        "alpha2",
                        "equip_equating",
                        "open_mean_quantity",
                        "open_mean_quantity_trimmed",
                        "open_median_quantity"
                      ),
                      labels = c(
                        "MOC",
                        "OSE-RG",
                        "Open Q. (untrimmed)",
                        "Open Q. (trimmed)",
                        "Open Q. (median)"
                      ))
  ) %>% 
  ggplot(aes(equivalents, interval_label, color=approach, shape = approach, linetype = approach))+
  facet_wrap(vars(condition), ncol = 1, scales = "free")+
  geom_line(aes(group = approach), size = 1)+
  geom_point(size = 4, stroke = 2)+
  #geom_text(aes(0, label = interval_label, color = NA), hjust = 1, stat = "unique")+
  scale_color_manual(values = c("#497593", "orange", "#00ABA0", "#00ABA0", "#00ABA0"))+
  scale_linetype_manual(values = c(NA, NA, "solid","dashed", "dotted"))+
  scale_shape_manual(values = c(3,4,NA, NA, NA))+
  scale_x_continuous("Reported books")+
  scale_y_discrete("")+
  theme_bw(base_size = 20)+
  coord_flip()+
  #guides(color=guide_legend(ncol=2, byrow = TRUE))+
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    strip.background = element_blank(),
    #legend.position = "bottom",
    legend.title = element_blank()
  )

Subsample results

Values underlying the plot

Full sample results

harmonized_equivalents_df %>%  
  select(
    condition,
    interval = interval_label,
    MOC = alpha2,
    `OSE-RG`=equip_equating,
    `Open mean (untrimmed)` = open_mean_quantity,
    `Open mean (trimmed)` = open_mean_quantity_trimmed,
  ) %>% 
  mutate(across(where(is.numeric), round, 1)) %>% 
  style_table
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 1.8 4.3 4.3
low quantity [11, 25] 18.0 15.7 19.0 19.0
low quantity [26, 50] 38.0 49.5 41.8 41.8
low quantity [51, Inf] 102.0 199.7 396.1 231.5
medium quantity [0, 25] 12.5 7.1 9.8 9.8
medium quantity [26, 50] 38.0 39.7 41.8 41.8
medium quantity [51, 100] 75.5 99.5 88.2 88.2
medium quantity [101, Inf] 202.0 299.7 549.1 313.9
high quantity [0, 50] 25.0 15.1 24.9 24.9
high quantity [51, 100] 75.5 59.3 88.2 88.2
high quantity [101, 250] 175.5 149.8 180.6 180.6
high quantity [251, Inf] 502.0 499.7 875.8 472.6

Subsample results

condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 2.4 4.5 4.5
low quantity [11, 25] 18.0 20.2 18.9 18.9
low quantity [26, 50] 38.0 49.7 42.6 42.6
low quantity [51, Inf] 102.0 189.8 386.6 239.0
medium quantity [0, 25] 12.5 9.0 9.9 9.9
medium quantity [26, 50] 38.0 40.0 42.6 42.6
medium quantity [51, 100] 75.5 79.7 88.0 88.0
medium quantity [101, Inf] 202.0 250.2 558.7 339.1
high quantity [0, 50] 25.0 20.3 28.6 28.6
high quantity [51, 100] 75.5 60.1 88.0 88.0
high quantity [101, 250] 175.5 120.4 175.7 175.7
high quantity [251, Inf] 502.0 499.6 875.9 516.3
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 1.1 4.2 4.2
low quantity [11, 25] 18.0 14.7 19.1 19.1
low quantity [26, 50] 38.0 39.8 41.0 41.0
low quantity [51, Inf] 102.0 200.0 408.1 224.8
medium quantity [0, 25] 12.5 5.5 9.7 9.7
medium quantity [26, 50] 38.0 35.0 41.0 41.0
medium quantity [51, 100] 75.5 99.8 88.6 88.6
medium quantity [101, Inf] 202.0 300.0 541.1 291.0
high quantity [0, 50] 25.0 10.4 22.1 22.1
high quantity [51, 100] 75.5 50.3 88.6 88.6
high quantity [101, 250] 175.5 150.2 185.1 185.1
high quantity [251, Inf] 502.0 499.9 875.6 428.5
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 2.2 3.9 3.9
low quantity [11, 25] 18.0 19.7 19.2 19.2
low quantity [26, 50] 38.0 49.7 43.0 43.0
low quantity [51, Inf] 102.0 199.7 358.3 225.6
medium quantity [0, 25] 12.5 5.0 9.6 9.6
medium quantity [26, 50] 38.0 39.8 43.0 43.0
medium quantity [51, 100] 75.5 99.5 90.5 90.5
medium quantity [101, Inf] 202.0 299.6 493.6 304.7
high quantity [0, 50] 25.0 15.1 25.0 25.0
high quantity [51, 100] 75.5 59.5 90.5 90.5
high quantity [101, 250] 175.5 149.8 178.9 178.9
high quantity [251, Inf] 502.0 499.5 760.9 448.1
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 0.3 4.3 4.3
low quantity [11, 25] 18.0 14.5 19.5 19.5
low quantity [26, 50] 38.0 34.7 41.9 41.9
low quantity [51, Inf] 102.0 199.9 486.6 280.9
medium quantity [0, 25] 12.5 10.1 9.9 9.9
medium quantity [26, 50] 38.0 49.9 41.9 41.9
medium quantity [51, 100] 75.5 99.8 85.5 85.5
medium quantity [101, Inf] 202.0 300.3 652.1 370.7
high quantity [0, 50] 25.0 20.0 25.0 25.0
high quantity [51, 100] 75.5 75.0 85.5 85.5
high quantity [101, 250] 175.5 139.6 182.6 182.6
high quantity [251, Inf] 502.0 499.6 1041.8 563.1
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 0.9 5.2 5.2
low quantity [11, 25] 18.0 14.0 18.3 18.3
low quantity [26, 50] 38.0 38.3 39.3 39.3
low quantity [51, Inf] 102.0 149.6 379.3 162.9
medium quantity [0, 25] 12.5 9.6 10.1 10.1
medium quantity [26, 50] 38.0 30.4 39.3 39.3
medium quantity [51, 100] 75.5 59.7 83.6 83.6
medium quantity [101, Inf] 202.0 200.4 581.7 234.0
high quantity [0, 50] 25.0 10.4 24.4 24.4
high quantity [51, 100] 75.5 49.6 83.6 83.6
high quantity [101, 250] 175.5 121.6 182.7 182.7
high quantity [251, Inf] 502.0 500.5 1074.5 368.8
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 0.8 4.6 4.6
low quantity [11, 25] 18.0 14.9 17.8 17.8
low quantity [26, 50] 38.0 45.3 42.4 42.4
low quantity [51, Inf] 102.0 200.0 464.3 282.0
medium quantity [0, 25] 12.5 6.8 9.8 9.8
medium quantity [26, 50] 38.0 35.5 42.4 42.4
medium quantity [51, 100] 75.5 79.6 89.5 89.5
medium quantity [101, Inf] 202.0 299.8 614.7 369.2
high quantity [0, 50] 25.0 18.2 30.2 30.2
high quantity [51, 100] 75.5 50.1 89.5 89.5
high quantity [101, 250] 175.5 110.3 182.6 182.6
high quantity [251, Inf] 502.0 499.6 942.2 546.0
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 2.2 4.2 4.2
low quantity [11, 25] 18.0 19.6 19.5 19.5
low quantity [26, 50] 38.0 49.6 41.3 41.3
low quantity [51, Inf] 102.0 189.5 335.9 192.8
medium quantity [0, 25] 12.5 8.5 9.8 9.8
medium quantity [26, 50] 38.0 40.1 41.3 41.3
medium quantity [51, 100] 75.5 99.7 87.3 87.3
medium quantity [101, Inf] 202.0 279.9 484.5 267.8
high quantity [0, 50] 25.0 14.9 21.8 21.8
high quantity [51, 100] 75.5 75.0 87.3 87.3
high quantity [101, 250] 175.5 150.4 178.6 178.6
high quantity [251, Inf] 502.0 499.9 797.8 402.3
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 2.0 4.3 4.3
low quantity [11, 25] 18.0 19.6 19.0 19.0
low quantity [26, 50] 38.0 49.7 41.5 41.5
low quantity [51, Inf] 102.0 199.9 389.5 229.6
medium quantity [0, 25] 12.5 5.6 9.6 9.6
medium quantity [26, 50] 38.0 37.9 41.5 41.5
medium quantity [51, 100] 75.5 99.6 89.0 89.0
medium quantity [101, Inf] 202.0 299.7 543.2 312.8
high quantity [0, 50] 25.0 14.7 24.5 24.5
high quantity [51, 100] 75.5 54.8 89.0 89.0
high quantity [101, 250] 175.5 140.0 181.5 181.5
high quantity [251, Inf] 502.0 455.2 857.8 465.7
condition interval MOC OSE-RG Open mean (untrimmed) Open mean (trimmed)
low quantity [0, 10] 5.0 1.2 4.4 4.4
low quantity [11, 25] 18.0 13.0 19.0 19.0
low quantity [26, 50] 38.0 35.2 42.6 42.6
low quantity [51, Inf] 102.0 150.2 422.2 238.9
medium quantity [0, 25] 12.5 9.9 10.5 10.5
medium quantity [26, 50] 38.0 44.6 42.6 42.6
medium quantity [51, 100] 75.5 75.4 84.9 84.9
medium quantity [101, Inf] 202.0 250.3 574.7 319.6
high quantity [0, 50] 25.0 19.9 26.0 26.0
high quantity [51, 100] 75.5 60.1 84.9 84.9
high quantity [101, 250] 175.5 150.4 176.9 176.9
high quantity [251, Inf] 502.0 599.8 941.1 498.0

Harmonization Performance

Aggregated Mean bias

Data preparation

# Here, the average quantities are calculated for untrimmed data

aggregated_book_estimations_df <- data %>%  
  select(`low quantity` = books_low_freq,
         `medium quantity` = books_med_freq,
         `high quantity` = books_hi_freq,
         `open question` = books_numeric) %>% 
  # Recode response data with the equivalent values calculated earlier
  transmute(`open question` = `open question`,
            # Recode all conditions via MOC
            across(`low quantity`:`high quantity`, 
                   ~get_rec_vector(cur_column(), "alpha2")[.x], 
                   .names = "{.col}_MOC"),
            # Recode all conditions via OSE-RG
            across(`low quantity`:`high quantity`, 
                   ~get_rec_vector(cur_column(), "equip_equating")[.x], 
                   .names = "{.col}_OSE-RG"),
  ) %>% 
  # calculate mean book quantity for every harmonization approach and condition
  summarise(across(everything(), mean, na.rm=TRUE)) %>% 
  # Wide to long format
  pivot_longer(contains("quantity")) %>% 
  # Calculate differences from the open-ended question expectation
  mutate(
    delta_mean = value - `open question`
  ) %>% 
  # tidy up table
  separate(
    name, into = c("condition", "approach"), sep = "_"
  ) %>% 
  mutate(condition = factor(condition,
                            levels = c("low quantity", "medium quantity", "high quantity")),
         trim = "untrimmed")


# Here, the same is done as above, but this time with trimmed data

aggregated_book_estimations_trimmed_df <- data %>% 
  select(`low quantity` = books_low_freq,
         `medium quantity` = books_med_freq,
         `high quantity` = books_hi_freq,
         `open question` = books_numeric) %>% 
  transmute(`open question` = `open question`,
            across(`low quantity`:`high quantity`, 
                   ~get_rec_vector(cur_column(), "alpha2")[.x], .names = "{.col}_MOC"),
            across(`low quantity`:`high quantity`, 
                   ~get_rec_vector(cur_column(), "equip_equating")[.x], .names = "{.col}_OSE-RG"),
  ) %>% 
  summarise(across(everything(), mean_trim_upper, trim_upper = 0.05)) %>% 
  pivot_longer(contains("quantity")) %>% 
  mutate(
    delta_mean = value - `open question`
  ) %>% 
  separate(
    name, into = c("condition", "approach"), sep = "_"
  ) %>% 
  mutate(condition = factor(condition,
                            levels = c("low quantity", "medium quantity", "high quantity")),
         trim = "trimmed 5%")

Plot

Full sample result
# Untrimmed and trimmed means are combined and then plotted

aggregated_book_estimations_df %>% 
  add_row(aggregated_book_estimations_trimmed_df) %>% 
  mutate(delta_mean_percent = 100*(delta_mean / `open question`),
    trim = factor(trim,
                       levels = c("untrimmed", "trimmed 5%")),
         condition = fct_rev(condition),
         label_hjust = ifelse(approach == "MOC",
                              delta_mean - lead(delta_mean, 3) < 1,
                              delta_mean - lag(delta_mean, 3) < 1)
         )%>%  
  ggplot(aes(delta_mean_percent, as.integer(condition), color = approach, group = approach))+
  facet_grid(vars(trim))+
  geom_vline(aes(xintercept=0), color = "#666666", size = 1)+
  geom_smooth(method = "lm", se= FALSE, fullrange=TRUE, size = 2)+
  geom_smooth(method = "lm", se= FALSE, fullrange=TRUE, color = "#ffffffaa", size = 2)+
  geom_label(aes(label=paste0(" ", round(delta_mean_percent), "% "), hjust = label_hjust),
             size = 4, alpha = 0.7, label.size = 1, show.legend = FALSE)+
  geom_point(size = 3)+
  scale_x_continuous("Average quantity difference to open question (%)", limits = c(-500, 100))+
  scale_y_continuous("", labels = aggregated_book_estimations_df$condition %>%fct_rev %>%  levels(), breaks = c(1,2,3))+
  coord_cartesian(ylim = c(0.5,3.5),
                  xlim = c(-100,50))+
  scale_color_manual(values = c("#497593", "orange"))+
  theme_bw(base_size = 12)+
  theme(
    panel.grid.minor = element_blank()
  )

Subsample results

Interval to interval harmonization

# Calculating the equivalent scores

interval_interval_equivalents <- harmonized_equivalents_df %>% 
  select(lower_bound, upper_bound, interval_label, interval_number, alpha2) %>% 
  ungroup() %>% 
  mutate(
    equipercentile_towards_mid = c(
      calculate_eq_equivalents(freqtab_low, freqtab_med),
      calculate_eq_equivalents(freqtab_med, freqtab_med),
      calculate_eq_equivalents(freqtab_hi, freqtab_med)
    )
  )

Equivalents table

Full sample results

interval_interval_equivalents %>% 
  filter(condition != "medium quantity") %>% 
  transmute(condition, 
         interval_label, 
         `original score` = interval_number,
         `OSE-RG towards medium quantity` = equipercentile_towards_mid %>% round(2)) %>% 
  style_table()
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.75
low quantity [11, 25] 2 1.33
low quantity [26, 50] 3 2.18
low quantity [51, Inf] 4 3.77
high quantity [0, 50] 1 1.29
high quantity [51, 100] 2 2.67
high quantity [101, 250] 3 3.62
high quantity [251, Inf] 4 4.21

Subsample results

condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.76
low quantity [11, 25] 2 1.43
low quantity [26, 50] 3 2.32
low quantity [51, Inf] 4 3.82
high quantity [0, 50] 1 1.44
high quantity [51, 100] 2 2.83
high quantity [101, 250] 3 3.66
high quantity [251, Inf] 4 4.20
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.75
low quantity [11, 25] 2 1.27
low quantity [26, 50] 3 2.07
low quantity [51, Inf] 4 3.71
high quantity [0, 50] 1 1.20
high quantity [51, 100] 2 2.51
high quantity [101, 250] 3 3.57
high quantity [251, Inf] 4 4.21
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.81
low quantity [11, 25] 2 1.44
low quantity [26, 50] 3 2.27
low quantity [51, Inf] 4 3.79
high quantity [0, 50] 1 1.35
high quantity [51, 100] 2 2.73
high quantity [101, 250] 3 3.64
high quantity [251, Inf] 4 4.22
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.71
low quantity [11, 25] 2 1.07
low quantity [26, 50] 3 1.69
low quantity [51, Inf] 4 3.74
high quantity [0, 50] 1 1.25
high quantity [51, 100] 2 2.67
high quantity [101, 250] 3 3.53
high quantity [251, Inf] 4 4.10
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.66
low quantity [11, 25] 2 1.24
low quantity [26, 50] 3 2.28
low quantity [51, Inf] 4 3.75
high quantity [0, 50] 1 1.17
high quantity [51, 100] 2 2.53
high quantity [101, 250] 3 3.70
high quantity [251, Inf] 4 4.34
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.66
low quantity [11, 25] 2 1.32
low quantity [26, 50] 3 2.24
low quantity [51, Inf] 4 3.80
high quantity [0, 50] 1 1.43
high quantity [51, 100] 2 2.58
high quantity [101, 250] 3 3.54
high quantity [251, Inf] 4 4.16
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.78
low quantity [11, 25] 2 1.32
low quantity [26, 50] 3 2.14
low quantity [51, Inf] 4 3.74
high quantity [0, 50] 1 1.24
high quantity [51, 100] 2 2.77
high quantity [101, 250] 3 3.72
high quantity [251, Inf] 4 4.26
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.78
low quantity [11, 25] 2 1.41
low quantity [26, 50] 3 2.30
low quantity [51, Inf] 4 3.78
high quantity [0, 50] 1 1.29
high quantity [51, 100] 2 2.66
high quantity [101, 250] 3 3.58
high quantity [251, Inf] 4 4.18
condition interval_label original score OSE-RG towards medium quantity
low quantity [0, 10] 1 0.67
low quantity [11, 25] 2 1.09
low quantity [26, 50] 3 1.80
low quantity [51, Inf] 4 3.74
high quantity [0, 50] 1 1.29
high quantity [51, 100] 2 2.80
high quantity [101, 250] 3 3.78
high quantity [251, Inf] 4 4.30

Equivalents plot

Full sample results

interval_interval_equivalents %>% 
  mutate(condition = factor(condition),
         cond_num = as.integer(condition),
         interval_label = ifelse(str_detect(interval_label, "Inf"),
           paste0(str_remove(interval_label, "Inf]"), "\U221E)"),
           interval_label
         ))%>% 
  ggplot(aes(cond_num, equipercentile_towards_mid, color = interval_number))+
  geom_segment(aes(x = cond_num-0.4, xend = cond_num+0.4,
                   yend = equipercentile_towards_mid), size = 2, lineend = "round")+
  geom_label(aes(label = interval_label), label.padding = unit(0.5, "lines"), size = 6, label.size = 1)+
  scale_x_continuous("", labels = interval_interval_equivalents$condition %>% levels, breaks = c(1,2,3))+
  scale_y_continuous("OSE-RG equivalents in score format")+
  scale_color_viridis(option = "mako", end = 0.7, begin = 0.3)+
  theme_minimal(base_size = 12)+
  theme(
    legend.position = "none",
    plot.background = element_rect(color ="white", fill = "white")
  )

Subsample results

Interval to interval harmonization performance

# Calculating MOC equivalents
moc_low_med_hi <-intervals_df %>% 
  add_column(
    moc_interpolation(intervals_df, 2)
  ) %>% 
  pull(alpha2)

# Recoding data
interval_interval_data <-  data %>% 
  transmute(
    low_freq.raw = books_low_freq,
    med_freq.raw = books_med_freq,
    hi_freq.raw = books_hi_freq,
    # Apply OSE-RG to harmonize towards medium frequency condition
    low_freq.eq = calculate_eq_equivalents(freqtab_low, freqtab_med)[books_low_freq],
    med_freq.eq = calculate_eq_equivalents(freqtab_med, freqtab_med)[books_med_freq],
    hi_freq.eq = calculate_eq_equivalents(freqtab_hi, freqtab_med)[books_hi_freq],
    # Apply MOC to interpolate all conditions
    low_freq.moc = moc_low_med_hi[1:4][books_low_freq],
    med_freq.moc = moc_low_med_hi[5:8][books_med_freq],
    hi_freq.moc = moc_low_med_hi[9:12][books_hi_freq]
  )

# Caluclate 

interval_interval_aggregation <- interval_interval_data %>% 
  summarise(across(everything(), list(mean=mean, sd = sd), na.rm=TRUE, .names = "{.col}.{.fn}")) %>% 
  pivot_longer(everything()) %>% 
  separate(name, into = c("condition", "approach", "statistic"), sep = "\\.")

table

Full sample results

interval_interval_aggregation %>% 
  pivot_wider(names_from = statistic, values_from = value) %>% 
  mutate(
    mid_mean = mean[c(2,2,2,5,5,5,8,8,8)],
    mid_sd = sd[c(2,2,2,5,5,5,8,8,8)],
    d = (mean-mid_mean)/mid_sd,
    condition = factor(condition, levels = c(
      "low_freq",
      "med_freq",
      "hi_freq"),
      labels = c(
        "low quantity",
        "medium quantity",
        "high quantity"
      )) %>% fct_rev,
    approach = factor(approach,
                      levels = c("eq", "moc"),
                      labels = c("OSE-RG", "MOC")) %>% fct_rev()
    ) %>% 
  drop_na(approach) %>% 
  dplyr::relocate(approach) %>% 
  mutate(across(where(is.numeric), round, 2)) %>% 
  rename(
    `medium condition mean` = mid_mean,
    `medium condition sd` = mid_sd
  ) %>% kable() %>% kable_styling()
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 2.79 1.16 2.76 1.17 0.03
OSE-RG medium quantity 2.76 1.17 2.76 1.17 0.00
OSE-RG high quantity 2.76 1.16 2.76 1.17 0.00
MOC low quantity 67.07 39.84 102.53 80.21 -0.44
MOC medium quantity 102.53 80.21 102.53 80.21 0.00
MOC high quantity 175.32 183.02 102.53 80.21 0.91

Sub sample results

approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 2.99 1.10 2.95 1.11 0.03
OSE-RG medium quantity 2.95 1.11 2.95 1.11 0.00
OSE-RG high quantity 2.96 1.09 2.95 1.11 0.01
MOC low quantity 71.53 38.69 114.98 79.95 -0.54
MOC medium quantity 114.98 79.95 114.98 79.95 0.00
MOC high quantity 194.37 191.29 114.98 79.95 0.99
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 2.60 1.18 2.57 1.19 0.02
OSE-RG medium quantity 2.57 1.19 2.57 1.19 0.00
OSE-RG high quantity 2.57 1.19 2.57 1.19 0.00
MOC low quantity 62.56 40.53 90.65 78.78 -0.36
MOC medium quantity 90.65 78.78 90.65 78.78 0.00
MOC high quantity 156.75 172.78 90.65 78.78 0.84
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 2.82 1.15 2.80 1.16 0.02
OSE-RG medium quantity 2.80 1.16 2.80 1.16 0.00
OSE-RG high quantity 2.80 1.15 2.80 1.16 0.00
MOC low quantity 66.63 40.26 104.83 80.21 -0.48
MOC medium quantity 104.83 80.21 104.83 80.21 0.00
MOC high quantity 174.98 182.37 104.83 80.21 0.87
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 3.07 1.11 2.98 1.15 0.08
OSE-RG medium quantity 2.98 1.15 2.98 1.15 0.00
OSE-RG high quantity 2.99 1.15 2.98 1.15 0.01
MOC low quantity 81.01 35.05 119.29 81.47 -0.47
MOC medium quantity 119.29 81.47 119.29 81.47 0.00
MOC high quantity 242.33 211.50 119.29 81.47 1.51
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 2.46 1.15 2.45 1.16 0.00
OSE-RG medium quantity 2.45 1.16 2.45 1.16 0.00
OSE-RG high quantity 2.45 1.15 2.45 1.16 0.00
MOC low quantity 55.25 38.87 81.60 75.05 -0.35
MOC medium quantity 81.60 75.05 81.60 75.05 0.00
MOC high quantity 116.70 131.15 81.60 75.05 0.47
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 3.08 1.04 3.04 1.06 0.04
OSE-RG medium quantity 3.04 1.06 3.04 1.06 0.00
OSE-RG high quantity 3.07 1.04 3.04 1.06 0.03
MOC low quantity 75.65 36.33 119.33 78.78 -0.55
MOC medium quantity 119.33 78.78 119.33 78.78 0.00
MOC high quantity 223.22 195.17 119.33 78.78 1.32
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 2.57 1.20 2.55 1.2 0.02
OSE-RG medium quantity 2.55 1.20 2.55 1.2 0.00
OSE-RG high quantity 2.54 1.20 2.55 1.2 -0.01
MOC low quantity 60.62 41.17 89.99 79.1 -0.37
MOC medium quantity 89.99 79.10 89.99 79.1 0.00
MOC high quantity 137.97 163.69 89.99 79.1 0.61
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 2.80 1.14 2.77 1.15 0.02
OSE-RG medium quantity 2.77 1.15 2.77 1.15 0.00
OSE-RG high quantity 2.78 1.15 2.77 1.15 0.01
MOC low quantity 65.84 40.11 102.69 79.54 -0.46
MOC medium quantity 102.69 79.54 102.69 79.54 0.00
MOC high quantity 182.57 187.18 102.69 79.54 1.00
approach condition mean sd medium condition mean medium condition sd d
OSE-RG low quantity 2.78 1.21 2.71 1.22 0.06
OSE-RG medium quantity 2.71 1.22 2.71 1.22 0.00
OSE-RG high quantity 2.70 1.22 2.71 1.22 -0.01
MOC low quantity 71.04 38.81 102.63 82.78 -0.38
MOC medium quantity 102.63 82.78 102.63 82.78 0.00
MOC high quantity 148.50 164.44 102.63 82.78 0.55