facet_wrap(~ prompt, labeller=prompt_labeller, ncol = 4)
# Save the frequency plot in both vector formats (SVG for web, PDF for print).
# BUG FIX: plot_freq.pdf was previously saved twice with identical arguments;
# the duplicate ggsave() call has been removed.
ggsave("plot_freq.svg", plot_freq, height = 6, width = 10)
ggsave("plot_freq.pdf", plot_freq, height = 6, width = 10)
# Human codes plus all LLM classifications ("*_en" columns).
# A one-row helper tibble is appended so that the literal string "NA" becomes a
# factor level of code_human_en; bind_rows() automatically fills every column
# that is absent from the helper row with missing values (NA), so the LLM and
# id columns do not need to be spelled out.
data_performance <- data_llms %>%
  select(doc_id, reason, check, ends_with("_en")) %>%
  bind_rows(tibble(code_human_en = "NA")) %>%
  mutate(across(-c(doc_id, reason, check), as.factor)) %>%
  filter(code_human_en != "NA") # drop the helper row; the "NA" level remains
# Names of all columns holding LLM classifications (model x prompt x round)
model_approach <- grep("^(gpt_|llama_|mistral_)", names(data_performance), value = TRUE)
# Accumulators filled by the evaluation loop below
data_per_category_f1 <- data.frame()
data_macro_f1 <- data.frame(source = character(), macro_f1 = numeric())
# Extra accumulators for weighted F1, accuracy, Cohen's kappa (appendix)
data_weighted_f1 <- data.frame(source = character(), weighted_f1 = numeric())
data_performance_metrics <- data.frame()
# Evaluate every model/prompt column against the human codes: per-category F1,
# macro F1 (unweighted mean), weighted F1 (support-weighted mean), plus the
# overall metrics (accuracy, kappa, ...) reported in the appendix.
for (model in model_approach) {
  # caret confusion matrix: rows = predictions, columns = reference codes
  performance <- caret::confusionMatrix(
    data = data_performance[[model]],
    reference = data_performance$code_human_en,
    mode = "everything"
  )
  # Keep the raw confusion matrix in the workspace as cm_<model>
  assign(paste0("cm_", model), performance$table)
  ## Overall metrics for the appendix
  performance_metrics <- round(performance$overall, digits = 2)
  performance_metrics <- data.frame(
    metric = names(performance_metrics),
    value = as.numeric(performance_metrics),
    source = model
  )
  data_performance_metrics <- rbind(data_performance_metrics, performance_metrics)
  ## Per-category metrics, kept as performance_<model>_bycat
  assign(paste0("performance_", model, "_bycat"),
         round(performance$byClass, digits = 2))
  # Per-category F1 scores and category names
  f1_bycat_raw <- performance$byClass[, "F1"]
  categories_raw <- rownames(performance$byClass)
  # Exclude the artificial "NA" category from all F1 aggregates
  categories_trimmed <- trimws(categories_raw)
  valid_indices <- !stringr::str_detect(categories_trimmed, "^Class: NA$")
  f1_bycat <- f1_bycat_raw[valid_indices]
  categories <- categories_raw[valid_indices]
  # Collect per-category F1
  per_cat_f1 <- data.frame(
    category = categories,
    f1_score = f1_bycat,
    source = model,
    row.names = NULL
  ) %>%
    mutate(category = str_remove(category, "Class: "))
  data_per_category_f1 <- rbind(data_per_category_f1, per_cat_f1)
  # Macro F1: unweighted mean over the valid categories
  macro_f1 <- mean(f1_bycat, na.rm = TRUE)
  macro_f1 <- data.frame(
    macro_f1 = macro_f1,
    source = model
  )
  data_macro_f1 <- rbind(data_macro_f1, macro_f1)
  # Weighted F1: weight each category's F1 by its support, i.e. the number of
  # true instances of that reference class = column sums of the confusion
  # matrix (columns hold the reference labels in caret).
  # BUG FIX: this previously used diag(), which counts *correct* predictions
  # (true positives), not true instances, and therefore systematically
  # under-weighted poorly classified categories.
  true_counts <- colSums(performance$table)[valid_indices]
  total_instances <- sum(performance$table)
  weights <- true_counts / total_instances
  weighted_f1 <- sum(weights * f1_bycat, na.rm = TRUE)
  weighted_f1 <- data.frame(
    weighted_f1 = weighted_f1,
    source = model
  )
  data_weighted_f1 <- rbind(data_weighted_f1, weighted_f1)
}
# Macro-F1 heat-map data: first coding round only, with ordered factor levels
data_plot_macrof1 <- data_macro_f1 %>%
  filter(!grepl("_2", source)) %>% # keep the first round for the main analyses
  mutate(
    prompt = factor(
      case_when(
        str_detect(source, "zs") ~ "zs",
        str_detect(source, "desc") ~ "desc",
        str_detect(source, "fs") ~ "fs",
        str_detect(source, "ft") ~ "ft",
        TRUE ~ NA_character_
      ),
      levels = c("zs", "desc", "fs", "ft")
    ),
    source = factor(
      str_remove(source, "_(zs|desc|fs|ft)_en"),
      levels = c("gpt", "llama", "mistral")
    )
  )
# Heat map of macro F1 by LLM (x) and prompting approach (y)
plot_macrof1 <- ggplot(data_plot_macrof1,
                       aes(x = source, y = fct_rev(prompt), fill = macro_f1)) +
  geom_tile() +
  geom_text(aes(label = round(macro_f1, 2)), color = "white", size = 5) +
  scale_fill_viridis(limits = c(0, 1)) +
  scale_x_discrete(position = "top", labels = c("GPT", "LLAMA", "Mistral")) +
  # y axis shows fct_rev(prompt), so labels run from "ft" down to "zs"
  scale_y_discrete(labels = c("fine-tuned zero-shot", "few-shot",
                              "zero-shot with description", "zero-shot")) +
  labs(title = NULL, x = NULL, y = NULL, fill = "Macro F1") +
  theme_minimal()
# Save
ggsave("plot_macrof1.pdf", plot_macrof1, height = 4, width = 6)
# Per-category F1 heat-map data: first coding round, readable model names,
# categories ordered like the human frequency plot
data_plot_percatf1 <- data_per_category_f1 %>%
  filter(!str_ends(source, "_2_en")) %>% # first round only for main analyses
  mutate(
    prompt = factor(
      case_when(
        str_detect(source, "zs") ~ "zs",
        str_detect(source, "desc") ~ "desc",
        str_detect(source, "fs") ~ "fs",
        str_detect(source, "ft") ~ "ft",
        TRUE ~ NA_character_
      ),
      levels = c("zs", "desc", "fs", "ft")
    ),
    source = str_remove(source, "_(zs|desc|fs|ft)_en"),
    source = str_remove(source, "f1_"),
    source = case_when(
      source == "gpt" ~ "GPT",
      source == "llama" ~ "LLAMA",
      source == "mistral" ~ "Mistral",
      TRUE ~ source
    ),
    source = factor(source, levels = c("GPT", "LLAMA", "Mistral"))
  ) %>%
  rename(code_en = category) %>%
  mutate(code_en = factor(code_en, levels = human_sorted_categories))
# Save main-text copies: figure 4 (frequency plot) and figure 2 (macro F1)
ggsave("figure_4.pdf", plot_freq, height = 6, width = 10)
ggsave("figure_2.pdf", plot_macrof1, height = 4, width = 6)
# Per-category F1 heat map, faceted by human-assigned category
plot_percatf1 <- ggplot(data_plot_percatf1,
                        aes(x = source, y = fct_rev(prompt), fill = f1_score)) +
  geom_tile() +
  geom_text(aes(label = round(f1_score, 2)), color = "white", size = 3) +
  scale_fill_viridis(limits = c(0, 1)) +
  scale_x_discrete(position = "top", labels = c("GPT", "LLAMA", "Mistral")) +
  # y axis shows fct_rev(prompt), so labels run from "ft" down to "zs"
  scale_y_discrete(labels = c("fine-tuned zero-shot", "few-shot",
                              "zero-shot with description", "zero-shot")) +
  labs(title = NULL, x = NULL, y = NULL, fill = "F1") +
  theme_minimal() +
  facet_wrap(~ code_en, ncol = 4)
# Save
ggsave("figure_3.pdf", plot_percatf1, height = 6, width = 10)
# Numeric copy of the classifications for the ICC computation.
# NOTE(review): as.numeric() on factors yields the integer level codes —
# assumes the paired columns have aligned level orderings; verify upstream.
data_icc <- data_performance %>%
  select(-c(reason, check)) %>%
  mutate(across(everything(), as.numeric))
# Base names of every column that has a second-round ("_2_en") counterpart
column_pairs <- names(data_icc) %>%
  grep("_2_en$", ., value = TRUE) %>%
  sub("_2_en$", "", .) %>%
  unique()
# Intraclass correlation between a first-round column (<col_base>_en) and its
# second-round repeat (<col_base>_2_en).
#
# df       : data frame holding both columns (numeric codes)
# col_base : column name without the "_en"/"_2_en" suffix (e.g. "gpt_zs")
# Returns the single-rater two-way agreement ICC, or NA when fewer than two
# complete observation pairs remain after dropping missing values.
compute_icc <- function(df, col_base) {
  first_run <- paste0(col_base, "_en")
  second_run <- paste0(col_base, "_2_en")
  # Complete cases only: drop rows where either rating is missing
  ratings <- df %>%
    select(all_of(c(first_run, second_run))) %>%
    na.omit()
  if (nrow(ratings) < 2) {
    return(NA) # not enough data points for an ICC
  }
  # irr::icc, two-way agreement model for single ratings
  icc(ratings, model = "twoway", type = "agreement", unit = "single")$value
}
# ICC for every first/second round column pair
icc_results <- sapply(column_pairs, compute_icc, df = data_icc)
# All LLM output columns, both coding rounds
llm_columns <- grep("_zs_en|_desc_en|_fs_en|_ft_en|_zs_2_en|_desc_2_en|_fs_2_en|_ft_2_en",
                    names(data_performance), value = TRUE)
# Share of literal "NA" classifications per human category and LLM column
# (factors are compared as character; genuinely missing values are ignored)
na_proportions <- data_performance %>%
  group_by(code_human_en) %>%
  summarise(across(all_of(llm_columns),
                   ~ mean(as.character(.x) == "NA", na.rm = TRUE)))
# Long-format data: proportion of missing ("NA") classifications by category.
# BUG FIX: the second-round columns end in "_2_en", not "_2", so the previous
# select(-ends_with("_2")) removed nothing and silently mixed both coding
# rounds into this first-round plot. (The no-op names_prefix = "na_prop_" was
# also dropped: no column carries that prefix.)
data_plot_nas <- na_proportions %>%
  select(-ends_with("_2_en")) %>% # keep the first coding round only
  pivot_longer(cols = -code_human_en,
               names_to = "source",
               values_to = "freq"
  ) %>%
  mutate(
    prompt = case_when(
      str_detect(source, "zs") ~ "zs",
      str_detect(source, "desc") ~ "desc",
      str_detect(source, "fs") ~ "fs",
      str_detect(source, "ft") ~ "ft",
      TRUE ~ NA_character_
    ),
    source = case_when(
      str_detect(source, "gpt") ~ "gpt",
      str_detect(source, "llama") ~ "llama",
      str_detect(source, "mistral") ~ "mistral",
      TRUE ~ NA_character_
    ),
    source = factor(source, levels = c("gpt", "llama", "mistral")),
    prompt = factor(prompt, levels = c("zs", "desc", "fs", "ft")),
    # reversed so the first category appears at the top after coord_flip()
    code_human_en = factor(code_human_en, levels = rev(human_sorted_categories))
  )
# Dodged bars: share of missing classifications per category, LLM, and prompt
plot_nas <- ggplot(data_plot_nas,
                   aes(x = code_human_en, y = freq, fill = fct_rev(source))) +
  geom_col(position = position_dodge(preserve = "single")) +
  # NOTE(review): viridis(4) supplies one more color than the three LLM
  # levels; ggplot uses the first three — confirm the palette is as intended
  scale_fill_manual(values = viridis(4, direction = -1),
                    labels = c("Mistral", "LLAMA", "GPT")) +
  scale_y_continuous(labels = scales::percent_format(scale = 100)) +
  labs(title = NULL,
       x = NULL,
       y = "Missing Classifications",
       fill = "LLM") +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10),
        panel.grid.major.y = element_blank()) +
  coord_flip() +
  facet_wrap(~ prompt, labeller = prompt_labeller, ncol = 4)
# Save
ggsave("figure_a3.pdf", plot_nas, height = 6, width = 10)
# Harmonise factor levels/labels for the NA-reason analysis
data_llms_long_ax <- data_llms_long_ax %>%
  mutate(
    na_reason = factor(na_reason,
                       levels = c("no change", "no match", "more than 1 match")),
    llm = factor(llm,
                 levels = c("gpt", "llama", "mistral"),
                 labels = c("GPT", "LLAMA", "Mistral")),
    prompt = factor(prompt, levels = c("zs", "desc", "fs", "ft"))
  )
# Facet labels for the missing-classification plots.
# BUG FIX: "ft" was mislabelled "zero-shot with description" (a duplicate of
# the "desc" label); it is labelled "fine-tuned zero-shot" everywhere else.
facet_labels_prompt_nas <- c("zs" = "zero-shot",
                             "desc" = "zero-shot w. description",
                             "fs" = "few-shot",
                             "ft" = "fine-tuned zero-shot")
facet_labels_llm_nas <- c("gpt" = "GPT", "llama" = "LLAMA", "mistral" = "Mistral")
# Reasons for missing classifications as shares within each prompt x LLM cell
data_plot_na_reasons <- data_llms_long_ax %>%
  filter(na_reason != "no change") %>%
  count(prompt, llm, na_reason, name = "count") %>%
  group_by(prompt, llm) %>%
  mutate(total_count = sum(count),
         relative_freq = count / total_count,
         # re-factoring drops the now-unused "no change" level
         na_reason = factor(na_reason)) %>%
  ungroup()
# Stacked bars: distribution of reasons for outputs classified as missing
plot_na_reasons <- ggplot(data_plot_na_reasons,
                          aes(x = relative_freq, y = fct_rev(llm))) +
  geom_col(aes(fill = fct_rev(na_reason))) +
  geom_text(aes(label = count),
            position = position_stack(vjust = 0.5),
            color = "white",
            size = 5) +
  scale_x_continuous(name = "Missing Classifications",
                     labels = scales::percent_format(scale = 100)) +
  scale_y_discrete(name = NULL) +
  # breaks pinned to the original factor levels so labels stay aligned
  scale_fill_manual(values = viridis(2, direction = -1),
                    breaks = levels(data_plot_na_reasons$na_reason),
                    labels = c("no match", "more than 1 match")) +
  labs(title = NULL, fill = "Output") +
  facet_grid(~prompt, labeller = labeller(prompt = facet_labels_prompt_nas,
                                          llm = facet_labels_llm_nas)) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10),
        panel.grid.major.y = element_blank(),
        legend.position = "top")
# Save
ggsave("figure_a4.pdf", plot_na_reasons, height = 6, width = 8)
# Accumulators for the robustness analysis with "NA" outputs omitted (noNA)
data_per_category_f1_noNA <- data.frame()
data_macro_f1_noNA <- data.frame(source = character(), macro_f1 = numeric())
# Extra accumulators for weighted F1, accuracy, Cohen's kappa (appendix)
data_weighted_f1_noNA <- data.frame(source = character(), weighted_f1 = numeric())
data_performance_metrics_noNA <- data.frame()
# Same evaluation loop as above, but rows where the model emitted the literal
# "NA" output are removed before scoring (robustness check for the appendix).
for (model in model_approach) {
  # Keep only rows with a substantive model classification
  filtered_data <- data_performance[data_performance[[model]] != "NA", ]
  # Drop the now-unused factor levels on both prediction and reference side
  filtered_data[[model]] <- droplevels(filtered_data[[model]])
  filtered_data$code_human_en <- droplevels(filtered_data$code_human_en)
  # caret confusion matrix: rows = predictions, columns = reference codes
  performance_noNA <- caret::confusionMatrix(
    data = filtered_data[[model]],
    reference = filtered_data$code_human_en,
    mode = "everything"
  )
  # Keep the raw confusion matrix in the workspace as cm_<model>_noNA
  assign(paste0("cm_", model, "_noNA"), performance_noNA$table)
  ## Overall metrics for the appendix
  performance_metrics_noNA <- round(performance_noNA$overall, digits = 2)
  performance_metrics_noNA <- data.frame(
    metric = names(performance_metrics_noNA),
    value = as.numeric(performance_metrics_noNA),
    source = model
  )
  data_performance_metrics_noNA <- rbind(data_performance_metrics_noNA, performance_metrics_noNA)
  ## Per-category metrics, kept as performance_<model>_bycat_noNA
  assign(paste0("performance_", model, "_bycat_noNA"),
         round(performance_noNA$byClass, digits = 2))
  # Per-category F1 scores and category names
  f1_bycat_raw_noNA <- performance_noNA$byClass[, "F1"]
  categories_raw_noNA <- rownames(performance_noNA$byClass)
  # Exclude any remaining "NA" category from all F1 aggregates
  categories_trimmed_noNA <- trimws(categories_raw_noNA)
  valid_indices_noNA <- !stringr::str_detect(categories_trimmed_noNA, "^Class: NA$")
  f1_bycat_noNA <- f1_bycat_raw_noNA[valid_indices_noNA]
  categories_noNA <- categories_raw_noNA[valid_indices_noNA]
  # Collect per-category F1
  per_cat_f1_noNA <- data.frame(
    category = categories_noNA,
    f1_score = f1_bycat_noNA,
    source = model,
    row.names = NULL
  ) %>%
    mutate(category = str_remove(category, "Class: "))
  data_per_category_f1_noNA <- rbind(data_per_category_f1_noNA, per_cat_f1_noNA)
  # Macro F1: unweighted mean over the valid categories
  macro_f1_noNA <- mean(f1_bycat_noNA, na.rm = TRUE)
  macro_f1_noNA <- data.frame(
    macro_f1 = macro_f1_noNA,
    source = model
  )
  data_macro_f1_noNA <- rbind(data_macro_f1_noNA, macro_f1_noNA)
  # Weighted F1: weight each category's F1 by its support, i.e. the number of
  # true instances per reference class (column sums of the confusion matrix).
  # BUG FIX: this previously used diag(), the count of *correct* predictions,
  # which systematically under-weights poorly classified categories.
  true_counts_noNA <- colSums(performance_noNA$table)[valid_indices_noNA]
  total_instances_noNA <- sum(performance_noNA$table)
  weights_noNA <- true_counts_noNA / total_instances_noNA
  weighted_f1_noNA <- sum(weights_noNA * f1_bycat_noNA, na.rm = TRUE)
  weighted_f1_noNA <- data.frame(
    weighted_f1 = weighted_f1_noNA,
    source = model
  )
  data_weighted_f1_noNA <- rbind(data_weighted_f1_noNA, weighted_f1_noNA)
}
# Wide appendix table: one row per model/approach, accuracy and kappa only
data_performance_metrics_noNA <- data_performance_metrics_noNA %>%
  pivot_wider(names_from = metric, values_from = value) %>%
  select(source, Accuracy, Kappa)
# Macro-F1 heat-map data (NA-omitted variant): first coding round only
data_plot_macrof1_noNA <- data_macro_f1_noNA %>%
  filter(!grepl("_2_en", source)) %>% # keep the first round for main analyses
  mutate(
    prompt = factor(
      case_when(
        str_detect(source, "zs") ~ "zs",
        str_detect(source, "desc") ~ "desc",
        str_detect(source, "fs") ~ "fs",
        str_detect(source, "ft") ~ "ft",
        TRUE ~ NA_character_
      ),
      levels = c("zs", "desc", "fs", "ft")
    ),
    source = factor(
      str_remove(source, "_(zs|desc|fs|ft)_en"),
      levels = c("gpt", "llama", "mistral")
    )
  )
# Heat map of macro F1 by LLM and prompting approach, "NA" outputs omitted
plot_macrof1_noNA <- ggplot(data_plot_macrof1_noNA,
                            aes(x = source, y = fct_rev(prompt), fill = macro_f1)) +
  geom_tile() +
  geom_text(aes(label = round(macro_f1, 2)), color = "white", size = 5) +
  scale_fill_viridis(limits = c(0, 1)) +
  scale_x_discrete(position = "top", labels = c("GPT", "LLAMA", "Mistral")) +
  # y axis shows fct_rev(prompt), so labels run from "ft" down to "zs"
  scale_y_discrete(labels = c("fine-tuned zero-shot", "few-shot",
                              "zero-shot with description", "zero-shot")) +
  labs(title = NULL, x = NULL, y = NULL, fill = "Macro F1") +
  theme_minimal()
# Save
ggsave("figure_a5.pdf", plot_macrof1_noNA, height = 4, width = 6)
# Per-category F1 heat-map data (NA-omitted variant): first round only
data_plot_percatf1_noNA <- data_per_category_f1_noNA %>%
  filter(!str_ends(source, "_2_en")) %>% # first round only for main analyses
  mutate(
    prompt = factor(
      case_when(
        str_detect(source, "zs") ~ "zs",
        str_detect(source, "desc") ~ "desc",
        str_detect(source, "fs") ~ "fs",
        str_detect(source, "ft") ~ "ft",
        TRUE ~ NA_character_
      ),
      levels = c("zs", "desc", "fs", "ft")
    ),
    source = str_remove(source, "_(zs|desc|fs|ft)_en"),
    source = str_remove(source, "f1_"),
    source = case_when(
      source == "gpt" ~ "GPT",
      source == "llama" ~ "LLAMA",
      source == "mistral" ~ "Mistral",
      TRUE ~ source
    ),
    source = factor(source, levels = c("GPT", "LLAMA", "Mistral"))
  ) %>%
  rename(code_en = category) %>%
  mutate(code_en = factor(code_en, levels = human_sorted_categories))
# Per-category F1 heat map with "NA" outputs omitted, faceted by category
plot_percatf1_noNA <- ggplot(data_plot_percatf1_noNA,
                             aes(x = source, y = fct_rev(prompt), fill = f1_score)) +
  geom_tile() +
  geom_text(aes(label = round(f1_score, 2)), color = "white", size = 3) +
  scale_fill_viridis(limits = c(0, 1)) +
  scale_x_discrete(position = "top", labels = c("GPT", "LLAMA", "Mistral")) +
  # y axis shows fct_rev(prompt), so labels run from "ft" down to "zs"
  scale_y_discrete(labels = c("fine-tuned zero-shot", "few-shot",
                              "zero-shot with description", "zero-shot")) +
  labs(title = NULL, x = NULL, y = NULL, fill = "F1") +
  theme_minimal() +
  facet_wrap(~ code_en, ncol = 4)
# Save
ggsave("figure_a6.pdf", plot_percatf1_noNA, height = 6, width = 10)
