library(tidyverse)
library(targets)
library(yardstick)
library(broom)
library(patchwork)
library(scales)
library(here)
# Load data
# Need to use this withr thing because tar_read() and tar_load() need to see the
# _targets folder in the current directory, but this .Rmd file is in a subfolder
withr::with_dir(here::here(), {
source(tar_read(plot_funs))
canary_testing_lagged <- tar_read(panel_testing_lagged)
# Load big list of models
model_df <- tar_read(model_df) %>%
filter(str_detect(model, "baseline") | str_detect(model, "v2csreprss"))
# Load actual model objects
tar_load(c(m_pts_baseline_train, m_pts_v2csreprss_train,
m_pts_total_train, m_pts_advocacy_train,
m_pts_entry_train, m_pts_funding_train,
m_lhr_baseline_train, m_lhr_v2csreprss_train,
m_lhr_total_train, m_lhr_advocacy_train,
m_lhr_entry_train, m_lhr_funding_train))
})
pts_levels <- levels(canary_testing_lagged$PTS_factor)
# Returns a data frame of predicted probabilities with actual = 1
# when the response outcome happens
match_actual <- function(x, pred, actual) {
pred %>%
select(fitted = {{x}}) %>%
mutate(actual = as.numeric(actual == colnames(pred[x])),
plot_level = colnames(pred[x]))
}
E1a: NGO laws and political terror
seed_pts_prediction <- 5494 # From random.org
set.seed(seed_pts_prediction)
pred_pts_baseline <- predict(
m_pts_baseline_train,
newdata = select(canary_testing_lagged, -PTS_factor_lead1),
allow_new_levels = TRUE
) %>%
as_tibble() %>%
magrittr::set_colnames(pts_levels)
set.seed(seed_pts_prediction)
pred_pts_total <- predict(
m_pts_total_train,
newdata = select(canary_testing_lagged, -PTS_factor_lead1),
allow_new_levels = TRUE
) %>%
as_tibble() %>%
magrittr::set_colnames(pts_levels)
set.seed(seed_pts_prediction)
pred_pts_advocacy <- predict(
m_pts_advocacy_train,
newdata = select(canary_testing_lagged, -PTS_factor_lead1),
allow_new_levels = TRUE
) %>%
as_tibble() %>%
magrittr::set_colnames(pts_levels)
set.seed(seed_pts_prediction)
pred_pts_entry <- predict(
m_pts_entry_train,
newdata = select(canary_testing_lagged, -PTS_factor_lead1),
allow_new_levels = TRUE
) %>%
as_tibble() %>%
magrittr::set_colnames(pts_levels)
set.seed(seed_pts_prediction)
pred_pts_funding <- predict(
m_pts_funding_train,
newdata = select(canary_testing_lagged, -PTS_factor_lead1),
allow_new_levels = TRUE
) %>%
as_tibble() %>%
magrittr::set_colnames(pts_levels)
Separation plots
Baseline models
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_baseline),
FUN = match_actual,
pred = pred_pts_baseline,
actual = as.character(canary_testing_lagged$PTS_factor_lead1))
plot_data <- pred_long_list %>%
bind_rows() %>%
filter(!is.nan(fitted), !is.na(actual)) %>%
group_by(plot_level) %>%
arrange(plot_level, fitted) %>%
mutate(index = row_number()) %>%
ungroup()
markers <- plot_data %>%
group_by(plot_level) %>%
summarize(obs = n(), expected = sum(fitted),
marker = obs - expected + 1)
n_in_group <- nrow(pred_pts_baseline)
plot_data_condensed <- plot_data %>%
# rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
mutate(rleid = cumsum(rleid)) %>%
group_by(plot_level, rleid) %>%
slice(1) %>%
group_by(plot_level) %>%
mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>%
ungroup()
ggplot(plot_data_condensed) +
geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) +
geom_line(aes(x = index, y = fitted), size = 0.3) +
geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) +
scale_fill_manual(values = c("grey90", "#2ECC40")) +
guides(fill = "none") +
facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
theme_void(base_family = "Inter Bold")
Models with NGO laws
Total legal barriers
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_total),
FUN = match_actual,
pred = pred_pts_total,
actual = as.character(canary_testing_lagged$PTS_factor_lead1))
plot_data <- pred_long_list %>%
bind_rows() %>%
filter(!is.nan(fitted), !is.na(actual)) %>%
group_by(plot_level) %>%
arrange(plot_level, fitted) %>%
mutate(index = row_number()) %>%
ungroup()
markers <- plot_data %>%
group_by(plot_level) %>%
summarize(obs = n(), expected = sum(fitted),
marker = obs - expected + 1)
n_in_group <- nrow(pred_pts_total)
plot_data_condensed <- plot_data %>%
# rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
mutate(rleid = cumsum(rleid)) %>%
group_by(plot_level, rleid) %>%
slice(1) %>%
group_by(plot_level) %>%
mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>%
ungroup()
ggplot(plot_data_condensed) +
geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) +
geom_line(aes(x = index, y = fitted), size = 0.3) +
geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) +
scale_fill_manual(values = c("grey90", "#2ECC40")) +
guides(fill = "none") +
facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
theme_void(base_family = "Inter Bold")
Barriers to advocacy
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_advocacy),
FUN = match_actual,
pred = pred_pts_advocacy,
actual = as.character(canary_testing_lagged$PTS_factor_lead1))
plot_data <- pred_long_list %>%
bind_rows() %>%
filter(!is.nan(fitted), !is.na(actual)) %>%
group_by(plot_level) %>%
arrange(plot_level, fitted) %>%
mutate(index = row_number()) %>%
ungroup()
markers <- plot_data %>%
group_by(plot_level) %>%
summarize(obs = n(), expected = sum(fitted),
marker = obs - expected + 1)
n_in_group <- nrow(pred_pts_advocacy)
plot_data_condensed <- plot_data %>%
# rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
mutate(rleid = cumsum(rleid)) %>%
group_by(plot_level, rleid) %>%
slice(1) %>%
group_by(plot_level) %>%
mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>%
ungroup()
ggplot(plot_data_condensed) +
geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) +
geom_line(aes(x = index, y = fitted), size = 0.3) +
geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) +
scale_fill_manual(values = c("grey90", "#2ECC40")) +
guides(fill = "none") +
facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
theme_void(base_family = "Inter Bold")
Barriers to entry
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_entry),
FUN = match_actual,
pred = pred_pts_entry,
actual = as.character(canary_testing_lagged$PTS_factor_lead1))
plot_data <- pred_long_list %>%
bind_rows() %>%
filter(!is.nan(fitted), !is.na(actual)) %>%
group_by(plot_level) %>%
arrange(plot_level, fitted) %>%
mutate(index = row_number()) %>%
ungroup()
markers <- plot_data %>%
group_by(plot_level) %>%
summarize(obs = n(), expected = sum(fitted),
marker = obs - expected + 1)
n_in_group <- nrow(pred_pts_entry)
plot_data_condensed <- plot_data %>%
# rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
mutate(rleid = cumsum(rleid)) %>%
group_by(plot_level, rleid) %>%
slice(1) %>%
group_by(plot_level) %>%
mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>%
ungroup()
ggplot(plot_data_condensed) +
geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) +
geom_line(aes(x = index, y = fitted), size = 0.3) +
geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) +
scale_fill_manual(values = c("grey90", "#2ECC40")) +
guides(fill = "none") +
facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
theme_void(base_family = "Inter Bold")
Barriers to funding
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_funding),
FUN = match_actual,
pred = pred_pts_funding,
actual = as.character(canary_testing_lagged$PTS_factor_lead1))
plot_data <- pred_long_list %>%
bind_rows() %>%
filter(!is.nan(fitted), !is.na(actual)) %>%
group_by(plot_level) %>%
arrange(plot_level, fitted) %>%
mutate(index = row_number()) %>%
ungroup()
markers <- plot_data %>%
group_by(plot_level) %>%
summarize(obs = n(), expected = sum(fitted),
marker = obs - expected + 1)
n_in_group <- nrow(pred_pts_funding)
plot_data_condensed <- plot_data %>%
# rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
mutate(rleid = cumsum(rleid)) %>%
group_by(plot_level, rleid) %>%
slice(1) %>%
group_by(plot_level) %>%
mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>%
ungroup()
ggplot(plot_data_condensed) +
geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) +
geom_line(aes(x = index, y = fitted), size = 0.3) +
geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) +
scale_fill_manual(values = c("grey90", "#2ECC40")) +
guides(fill = "none") +
facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
theme_void(base_family = "Inter Bold")
Prediction diagnostics
pred_pts_baseline_categories <- pred_pts_baseline %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
group_by(rowname) %>%
filter(rank(-value, ties.method = "last") == 1) %>%
ungroup() %>%
select(predicted = name)
pred_pts_total_categories <- pred_pts_total %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
group_by(rowname) %>%
filter(rank(-value, ties.method = "last") == 1) %>%
ungroup() %>%
select(predicted = name)
pred_pts_advocacy_categories <- pred_pts_advocacy %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
group_by(rowname) %>%
filter(rank(-value, ties.method = "last") == 1) %>%
ungroup() %>%
select(predicted = name)
pred_pts_entry_categories <- pred_pts_entry %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
group_by(rowname) %>%
filter(rank(-value, ties.method = "last") == 1) %>%
ungroup() %>%
select(predicted = name)
pred_pts_funding_categories <- pred_pts_funding %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
group_by(rowname) %>%
filter(rank(-value, ties.method = "last") == 1) %>%
ungroup() %>%
select(predicted = name)
pts_cat_preds <- canary_testing_lagged %>%
select(country, year, actual = PTS_factor_lead1) %>%
mutate(pred_baseline = pred_pts_baseline_categories$predicted,
pred_total = pred_pts_total_categories$predicted,
pred_advocacy = pred_pts_advocacy_categories$predicted,
pred_entry = pred_pts_entry_categories$predicted,
pred_funding = pred_pts_funding_categories$predicted) %>%
filter(!is.na(actual)) %>%
mutate(is_correct_baseline = actual == pred_baseline,
is_correct_total = actual == pred_total,
is_correct_advocacy = actual == pred_advocacy,
is_correct_entry = actual == pred_entry,
is_correct_funding = actual == pred_funding)
# Improved predictions
pts_improved_total <- pts_cat_preds %>%
filter(!is_correct_baseline & is_correct_total) %>%
select(Country = country, Year = year, Actual = actual,
`CS repression included` = pred_total,
`Baseline` = pred_baseline)
pts_improved_advocacy <- pts_cat_preds %>%
filter(!is_correct_baseline & is_correct_advocacy) %>%
select(Country = country, Year = year, Actual = actual,
`CS repression included` = pred_advocacy,
`Baseline` = pred_baseline)
pts_improved_entry <- pts_cat_preds %>%
filter(!is_correct_baseline & is_correct_entry) %>%
select(Country = country, Year = year, Actual = actual,
`CS repression included` = pred_entry,
`Baseline` = pred_baseline)
pts_improved_funding <- pts_cat_preds %>%
filter(!is_correct_baseline & is_correct_funding) %>%
select(Country = country, Year = year, Actual = actual,
`CS repression included` = pred_funding,
`Baseline` = pred_baseline)
pts_improved_total %>%
kbl(caption = "Cases improved with total legal barriers") %>%
kable_styling()
Cases improved with total legal barriers
Country
|
Year
|
Actual
|
CS repression included
|
Baseline
|
Guatemala
|
2013
|
Level 3
|
Level 3
|
Level 2
|
Kosovo
|
2012
|
Level 1
|
Level 1
|
Level 2
|
Greece
|
2011
|
Level 2
|
Level 2
|
Level 3
|
Sierra Leone
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Congo - Brazzaville
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Somalia
|
2013
|
Level 5
|
Level 5
|
Level 4
|
Bahrain
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Bahrain
|
2013
|
Level 3
|
Level 3
|
Level 2
|
pts_improved_advocacy %>%
kbl(caption = "Cases improved with barriers to advocacy") %>%
kable_styling()
Cases improved with barriers to advocacy
Country
|
Year
|
Actual
|
CS repression included
|
Baseline
|
Guatemala
|
2013
|
Level 3
|
Level 3
|
Level 2
|
Bulgaria
|
2012
|
Level 2
|
Level 2
|
Level 1
|
Congo - Brazzaville
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Bahrain
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Bahrain
|
2013
|
Level 3
|
Level 3
|
Level 2
|
pts_improved_entry %>%
kbl(caption = "Cases improved with barriers to entry") %>%
kable_styling()
Cases improved with barriers to entry
Country
|
Year
|
Actual
|
CS repression included
|
Baseline
|
Guatemala
|
2013
|
Level 3
|
Level 3
|
Level 2
|
Kosovo
|
2012
|
Level 1
|
Level 1
|
Level 2
|
Greece
|
2011
|
Level 2
|
Level 2
|
Level 3
|
Sierra Leone
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Bahrain
|
2013
|
Level 3
|
Level 3
|
Level 2
|
pts_improved_funding %>%
kbl(caption = "Cases improved with barriers to funding") %>%
kable_styling()
Cases improved with barriers to funding
Country
|
Year
|
Actual
|
CS repression included
|
Baseline
|
Guatemala
|
2013
|
Level 3
|
Level 3
|
Level 2
|
Bulgaria
|
2012
|
Level 2
|
Level 2
|
Level 1
|
Sierra Leone
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Congo - Brazzaville
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Tunisia
|
2013
|
Level 3
|
Level 3
|
Level 2
|
Bahrain
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Bahrain
|
2013
|
Level 3
|
Level 3
|
Level 2
|
correct_lookup <- tribble(
~name, ~nice_name,
"is_correct_total", "Total legal barriers included",
"is_correct_advocacy", "Barriers to advocacy included",
"is_correct_entry", "Barriers to entry included",
"is_correct_funding", "Barriers to funding included",
"is_correct_baseline", "Baseline"
) %>% mutate(nice_name = fct_inorder(nice_name, ordered = TRUE))
pts_pred_table <- pts_cat_preds %>%
pivot_longer(starts_with("is_correct")) %>%
group_by(name, value) %>%
summarize(n = n()) %>%
ungroup() %>%
left_join(correct_lookup, by = "name") %>%
mutate(Prediction = ifelse(value == TRUE, "Correct", "Wrong")) %>%
arrange(nice_name) %>%
select(-name, -value) %>%
pivot_wider(names_from = "nice_name", values_from = "n")
pts_pred_table %>%
kbl(align = "lccccc") %>%
kable_styling()
Prediction
|
Total legal barriers included
|
Barriers to advocacy included
|
Barriers to entry included
|
Barriers to funding included
|
Baseline
|
Wrong
|
107
|
110
|
108
|
108
|
109
|
Correct
|
378
|
375
|
377
|
377
|
376
|
E1b: NGO laws and latent human rights
seed_lhr_prediction <- 3207 # From random.org
set.seed(seed_lhr_prediction)
pred_lhr_baseline <- predict(m_lhr_baseline_train,
newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
allow_new_levels = TRUE)
set.seed(seed_lhr_prediction)
pred_lhr_total <- predict(m_lhr_total_train,
newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
allow_new_levels = TRUE)
set.seed(seed_lhr_prediction)
pred_lhr_advocacy <- predict(m_lhr_advocacy_train,
newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
allow_new_levels = TRUE)
set.seed(seed_lhr_prediction)
pred_lhr_entry <- predict(m_lhr_entry_train,
newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
allow_new_levels = TRUE)
set.seed(seed_lhr_prediction)
pred_lhr_funding <- predict(m_lhr_funding_train,
newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
allow_new_levels = TRUE)
pred_lhr_baseline_test <- canary_testing_lagged %>%
select(country, gwcode, year, latent_hr_mean_lead1) %>%
bind_cols(as_tibble(pred_lhr_baseline)) %>%
filter(!is.na(latent_hr_mean_lead1))
pred_lhr_total_test <- canary_testing_lagged %>%
select(country, gwcode, year, latent_hr_mean_lead1) %>%
bind_cols(as_tibble(pred_lhr_total)) %>%
filter(!is.na(latent_hr_mean_lead1))
pred_lhr_advocacy_test <- canary_testing_lagged %>%
select(country, gwcode, year, latent_hr_mean_lead1) %>%
bind_cols(as_tibble(pred_lhr_advocacy)) %>%
filter(!is.na(latent_hr_mean_lead1))
pred_lhr_entry_test <- canary_testing_lagged %>%
select(country, gwcode, year, latent_hr_mean_lead1) %>%
bind_cols(as_tibble(pred_lhr_entry)) %>%
filter(!is.na(latent_hr_mean_lead1))
pred_lhr_funding_test <- canary_testing_lagged %>%
select(country, gwcode, year, latent_hr_mean_lead1) %>%
bind_cols(as_tibble(pred_lhr_funding)) %>%
filter(!is.na(latent_hr_mean_lead1))
Improvement from baseline models
plot_baseline <- ggplot(pred_lhr_baseline_test,
aes(x = latent_hr_mean_lead1, y = Estimate)) +
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(title = "Baseline",
x = "Actual latent human rights value (t + 1)",
y = "Predicted latent human rights value (t + 1)") +
coord_equal() +
theme_ngo()
plot_total <- ggplot(pred_lhr_total_test,
aes(x = latent_hr_mean_lead1, y = Estimate)) +
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(title = "Total legal barriers",
x = "Actual latent human rights value (t + 1)",
y = "Predicted latent human rights value (t + 1)") +
coord_equal() +
theme_ngo()
plot_advocacy <- ggplot(pred_lhr_advocacy_test,
aes(x = latent_hr_mean_lead1, y = Estimate)) +
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(title = "Barriers to advocacy",
x = "Actual latent human rights value (t + 1)",
y = "Predicted latent human rights value (t + 1)") +
coord_equal() +
theme_ngo()
plot_entry <- ggplot(pred_lhr_entry_test,
aes(x = latent_hr_mean_lead1, y = Estimate)) +
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(title = "Barriers to entry",
x = "Actual latent human rights value (t + 1)",
y = "Predicted latent human rights value (t + 1)") +
coord_equal() +
theme_ngo()
plot_funding <- ggplot(pred_lhr_funding_test,
aes(x = latent_hr_mean_lead1, y = Estimate)) +
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(title = "Barriers to funding",
x = "Actual latent human rights value (t + 1)",
y = "Predicted latent human rights value (t + 1)") +
coord_equal() +
theme_ngo()
(plot_baseline | plot_total) / (plot_advocacy | plot_entry | plot_funding)
lotsa_metrics <- metric_set(rmse, rsq, mae, ccc)
bind_rows(lotsa_metrics(pred_lhr_baseline_test,
truth = latent_hr_mean_lead1,
estimate = Estimate) %>%
mutate(Model = "Baseline"),
lotsa_metrics(pred_lhr_total_test,
truth = latent_hr_mean_lead1,
estimate = Estimate) %>%
mutate(Model = "Total legal barriers"),
lotsa_metrics(pred_lhr_advocacy_test,
truth = latent_hr_mean_lead1,
estimate = Estimate) %>%
mutate(Model = "Barriers to advocacy"),
lotsa_metrics(pred_lhr_entry_test,
truth = latent_hr_mean_lead1,
estimate = Estimate) %>%
mutate(Model = "Barriers to entry"),
lotsa_metrics(pred_lhr_funding_test,
truth = latent_hr_mean_lead1,
estimate = Estimate) %>%
mutate(Model = "Barriers to funding")) %>%
select(Model, .metric, .estimate) %>%
pivot_wider(names_from = ".metric", values_from = ".estimate")
Model
|
rmse
|
rsq
|
mae
|
ccc
|
Baseline
|
0.1862156
|
0.9822788
|
0.1080593
|
0.9887926
|
Total legal barriers
|
0.1872181
|
0.9819975
|
0.1085471
|
0.9886937
|
Barriers to advocacy
|
0.1872011
|
0.9819955
|
0.1088119
|
0.9886847
|
Barriers to entry
|
0.1862635
|
0.9821720
|
0.1075320
|
0.9887725
|
Barriers to funding
|
0.1879768
|
0.9818439
|
0.1094280
|
0.9886148
|
Most improved countries
Total legal barriers
most_improved_lhr_total <- bind_cols(
pred_lhr_baseline_test %>%
mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>%
select(country, gwcode, year, latent_hr_mean_lead1,
estimate_baseline = Estimate, error_baseline),
pred_lhr_total_test %>%
mutate(error_total = Estimate - latent_hr_mean_lead1) %>%
select(estimate_total = Estimate, error_total)
) %>%
mutate(improved = error_total < 0 & abs(error_total) < abs(error_baseline)) %>%
mutate(diff = error_total - error_baseline) %>%
mutate(abs_diff = abs(diff)) %>%
filter(improved) %>%
arrange(desc(abs_diff))
nrow(most_improved_lhr_total)
## [1] 135
most_improved_lhr_total %>%
select(Country = country, Year = year,
`Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
`Predicted (baseline)` = estimate_baseline,
`Predicted (CS repression)` = estimate_total,
`|CS repression − baseline|` = abs_diff) %>%
head(5) %>%
kbl(digits = 3, align = "lccccc") %>%
kable_styling()
Country
|
Year
|
Actual latent human rights value (t + 1)
|
Predicted (baseline)
|
Predicted (CS repression)
|
|CS repression − baseline|
|
Cuba
|
2011
|
-0.069
|
-0.021
|
-0.070
|
0.049
|
Hungary
|
2013
|
1.555
|
1.596
|
1.551
|
0.045
|
Indonesia
|
2013
|
-0.214
|
-0.177
|
-0.217
|
0.040
|
Belarus
|
2012
|
0.208
|
0.245
|
0.207
|
0.037
|
North Korea
|
2012
|
-1.669
|
-1.637
|
-1.671
|
0.034
|
Barriers to advocacy
most_improved_lhr_advocacy <- bind_cols(
pred_lhr_baseline_test %>%
mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>%
select(country, gwcode, year, latent_hr_mean_lead1,
estimate_baseline = Estimate, error_baseline),
pred_lhr_advocacy_test %>%
mutate(error_advocacy = Estimate - latent_hr_mean_lead1) %>%
select(estimate_advocacy = Estimate, error_advocacy)
) %>%
mutate(improved = error_advocacy < 0 & abs(error_advocacy) < abs(error_baseline)) %>%
mutate(diff = error_advocacy - error_baseline) %>%
mutate(abs_diff = abs(diff)) %>%
filter(improved) %>%
arrange(desc(abs_diff))
nrow(most_improved_lhr_advocacy)
## [1] 133
most_improved_lhr_advocacy %>%
select(Country = country, Year = year,
`Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
`Predicted (baseline)` = estimate_baseline,
`Predicted (CS repression)` = estimate_advocacy,
`|CS repression − baseline|` = abs_diff) %>%
head(5) %>%
kbl(digits = 3, align = "lccccc") %>%
kable_styling()
Country
|
Year
|
Actual latent human rights value (t + 1)
|
Predicted (baseline)
|
Predicted (CS repression)
|
|CS repression − baseline|
|
Belarus
|
2012
|
0.208
|
0.245
|
0.207
|
0.037
|
North Korea
|
2012
|
-1.669
|
-1.637
|
-1.670
|
0.033
|
Cuba
|
2012
|
-0.078
|
-0.054
|
-0.087
|
0.033
|
Algeria
|
2012
|
0.469
|
0.484
|
0.455
|
0.029
|
Bhutan
|
2013
|
1.728
|
1.492
|
1.515
|
0.023
|
Barriers to entry
most_improved_lhr_entry <- bind_cols(
pred_lhr_baseline_test %>%
mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>%
select(country, gwcode, year, latent_hr_mean_lead1,
estimate_baseline = Estimate, error_baseline),
pred_lhr_entry_test %>%
mutate(error_entry = Estimate - latent_hr_mean_lead1) %>%
select(estimate_entry = Estimate, error_entry)
) %>%
mutate(improved = error_entry < 0 & abs(error_entry) < abs(error_baseline)) %>%
mutate(diff = error_entry - error_baseline) %>%
mutate(abs_diff = abs(diff)) %>%
filter(improved) %>%
arrange(desc(abs_diff))
nrow(most_improved_lhr_entry)
## [1] 116
most_improved_lhr_entry %>%
select(Country = country, Year = year,
`Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
`Predicted (baseline)` = estimate_baseline,
`Predicted (CS repression)` = estimate_entry,
`|CS repression − baseline|` = abs_diff) %>%
head(5) %>%
kbl(digits = 3, align = "lccccc") %>%
kable_styling()
Country
|
Year
|
Actual latent human rights value (t + 1)
|
Predicted (baseline)
|
Predicted (CS repression)
|
|CS repression − baseline|
|
Tunisia
|
2011
|
0.261
|
0.149
|
0.174
|
0.026
|
New Zealand
|
2012
|
3.605
|
3.536
|
3.559
|
0.023
|
Cuba
|
2013
|
-0.072
|
-0.061
|
-0.073
|
0.012
|
Syria
|
2012
|
-2.156
|
-2.208
|
-2.196
|
0.012
|
Solomon Islands
|
2013
|
4.001
|
3.911
|
3.923
|
0.012
|
Barriers to funding
most_improved_lhr_funding <- bind_cols(
pred_lhr_baseline_test %>%
mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>%
select(country, gwcode, year, latent_hr_mean_lead1,
estimate_baseline = Estimate, error_baseline),
pred_lhr_funding_test %>%
mutate(error_funding = Estimate - latent_hr_mean_lead1) %>%
select(estimate_funding = Estimate, error_funding)
) %>%
mutate(improved = error_funding < 0 & abs(error_funding) < abs(error_baseline)) %>%
mutate(diff = error_funding - error_baseline) %>%
mutate(abs_diff = abs(diff)) %>%
filter(improved) %>%
arrange(desc(abs_diff))
nrow(most_improved_lhr_funding)
## [1] 150
most_improved_lhr_funding %>%
select(Country = country, Year = year,
`Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
`Predicted (baseline)` = estimate_baseline,
`Predicted (CS repression)` = estimate_funding,
`|CS repression − baseline|` = abs_diff) %>%
head(5) %>%
kbl(digits = 3, align = "lccccc") %>%
kable_styling()
Country
|
Year
|
Actual latent human rights value (t + 1)
|
Predicted (baseline)
|
Predicted (CS repression)
|
|CS repression − baseline|
|
North Korea
|
2013
|
-1.687
|
-1.632
|
-1.695
|
0.063
|
Cuba
|
2011
|
-0.069
|
-0.021
|
-0.078
|
0.057
|
Eritrea
|
2011
|
-1.314
|
-1.261
|
-1.318
|
0.057
|
Eritrea
|
2013
|
-1.341
|
-1.298
|
-1.355
|
0.057
|
Eritrea
|
2012
|
-1.319
|
-1.288
|
-1.345
|
0.057
|
E2a: Civil society environment and political terror
seed_pts_prediction <- 5494 # From random.org
set.seed(seed_pts_prediction)
pred_pts_baseline <- predict(
m_pts_baseline_train,
newdata = select(canary_testing_lagged, -PTS_factor_lead1),
allow_new_levels = TRUE
) %>%
as_tibble() %>%
magrittr::set_colnames(pts_levels)
set.seed(seed_pts_prediction)
pred_pts_v2csreprss <- predict(
m_pts_v2csreprss_train,
newdata = select(canary_testing_lagged, -PTS_factor_lead1),
allow_new_levels = TRUE
) %>%
as_tibble() %>%
magrittr::set_colnames(pts_levels)
Separation plots
Baseline model
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_baseline),
FUN = match_actual,
pred = pred_pts_baseline,
actual = as.character(canary_testing_lagged$PTS_factor_lead1))
plot_data <- pred_long_list %>%
bind_rows() %>%
filter(!is.nan(fitted), !is.na(actual)) %>%
group_by(plot_level) %>%
arrange(plot_level, fitted) %>%
mutate(index = row_number()) %>%
ungroup()
markers <- plot_data %>%
group_by(plot_level) %>%
summarize(obs = n(), expected = sum(fitted),
marker = obs - expected + 1)
n_in_group <- nrow(pred_pts_baseline)
plot_data_condensed <- plot_data %>%
# rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
mutate(rleid = cumsum(rleid)) %>%
group_by(plot_level, rleid) %>%
slice(1) %>%
group_by(plot_level) %>%
mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>%
ungroup()
ggplot(plot_data_condensed) +
geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) +
geom_line(aes(x = index, y = fitted), size = 0.3) +
geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) +
scale_fill_manual(values = c("grey90", "#2ECC40")) +
guides(fill = "none") +
facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
theme_void(base_family = "Inter Bold")
Model with civil society repression
pred_long_list <- lapply(1:ncol(pred_pts_v2csreprss),
FUN = match_actual,
pred = pred_pts_v2csreprss,
actual = as.character(canary_testing_lagged$PTS_factor_lead1))
plot_data <- pred_long_list %>%
bind_rows() %>%
filter(!is.nan(fitted), !is.na(actual)) %>%
group_by(plot_level) %>%
arrange(plot_level, fitted) %>%
mutate(index = row_number()) %>%
ungroup()
markers <- plot_data %>%
group_by(plot_level) %>%
summarize(obs = n(), expected = sum(fitted),
marker = obs - expected + 1)
n_in_group <- nrow(pred_pts_v2csreprss)
plot_data_condensed <- plot_data %>%
# rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
mutate(rleid = cumsum(rleid)) %>%
group_by(plot_level, rleid) %>%
slice(1) %>%
group_by(plot_level) %>%
mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>%
ungroup()
ggplot(plot_data_condensed) +
geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) +
geom_line(aes(x = index, y = fitted), size = 0.3) +
geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) +
scale_fill_manual(values = c("grey90", "#2ECC40")) +
guides(fill = "none") +
facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
theme_void(base_family = "Inter Bold")
Prediction diagnostics
pred_pts_baseline_categories <- pred_pts_baseline %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
group_by(rowname) %>%
filter(rank(-value, ties.method = "last") == 1) %>%
ungroup() %>%
select(predicted = name)
pred_pts_v2csreprss_categories <- pred_pts_v2csreprss %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
group_by(rowname) %>%
filter(rank(-value, ties.method = "last") == 1) %>%
ungroup() %>%
select(predicted = name)
pts_cat_preds <- canary_testing_lagged %>%
select(country, year, actual = PTS_factor_lead1) %>%
mutate(pred_baseline = pred_pts_baseline_categories$predicted,
pred_v2csreprss = pred_pts_v2csreprss_categories$predicted) %>%
filter(!is.na(actual)) %>%
mutate(is_correct_baseline = actual == pred_baseline,
is_correct_v2csreprss = actual == pred_v2csreprss)
# Improved predictions
pts_improved <- pts_cat_preds %>%
filter(!is_correct_baseline & is_correct_v2csreprss) %>%
select(Country = country, Year = year, Actual = actual,
`CS repression included` = pred_v2csreprss,
`Baseline` = pred_baseline)
pts_improved %>%
kbl() %>%
kable_styling()
Country
|
Year
|
Actual
|
CS repression included
|
Baseline
|
Guatemala
|
2013
|
Level 3
|
Level 3
|
Level 2
|
Kosovo
|
2012
|
Level 1
|
Level 1
|
Level 2
|
Congo - Brazzaville
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Libya
|
2011
|
Level 4
|
Level 4
|
Level 5
|
Bahrain
|
2011
|
Level 3
|
Level 3
|
Level 2
|
Bahrain
|
2013
|
Level 3
|
Level 3
|
Level 2
|
correct_lookup <- tribble(
~name, ~nice_name,
"is_correct_v2csreprss", "CS repression included",
"is_correct_baseline", "Baseline"
) %>% mutate(nice_name = fct_inorder(nice_name, ordered = TRUE))
pts_pred_table <- pts_cat_preds %>%
pivot_longer(starts_with("is_correct")) %>%
group_by(name, value) %>%
summarize(n = n()) %>%
ungroup() %>%
left_join(correct_lookup, by = "name") %>%
mutate(Prediction = ifelse(value == TRUE, "Correct", "Wrong")) %>%
arrange(nice_name) %>%
select(-name, -value) %>%
pivot_wider(names_from = "nice_name", values_from = "n")
pts_pred_table %>%
kbl(align = "lcc") %>%
kable_styling()
Prediction
|
CS repression included
|
Baseline
|
Wrong
|
111
|
109
|
Correct
|
374
|
376
|
E2b: Civil society environment and latent human rights
seed_lhr_prediction <- 3207 # From random.org
set.seed(seed_lhr_prediction)
pred_lhr_baseline <- predict(m_lhr_baseline_train,
newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
allow_new_levels = TRUE)
set.seed(seed_lhr_prediction)
pred_lhr_v2csreprss <- predict(m_lhr_v2csreprss_train,
newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
allow_new_levels = TRUE)
pred_lhr_baseline_test <- canary_testing_lagged %>%
select(country, gwcode, year, latent_hr_mean_lead1) %>%
bind_cols(as_tibble(pred_lhr_baseline)) %>%
filter(!is.na(latent_hr_mean_lead1))
pred_lhr_v2csreprss_test <- canary_testing_lagged %>%
select(country, gwcode, year, latent_hr_mean_lead1) %>%
bind_cols(as_tibble(pred_lhr_v2csreprss)) %>%
filter(!is.na(latent_hr_mean_lead1))
Improvement from baseline models
plot_baseline <- ggplot(pred_lhr_baseline_test,
aes(x = latent_hr_mean_lead1, y = Estimate)) +
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(title = "Baseline",
x = "Actual latent human rights value (t + 1)",
y = "Predicted latent human rights value (t + 1)") +
coord_equal() +
theme_ngo()
plot_v2csreprss <- ggplot(pred_lhr_v2csreprss_test,
aes(x = latent_hr_mean_lead1, y = Estimate)) +
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(title = "Added predictor",
x = "Actual latent human rights value (t + 1)",
y = "Predicted latent human rights value (t + 1)") +
coord_equal() +
theme_ngo()
plot_baseline | plot_v2csreprss
lotsa_metrics <- metric_set(rmse, rsq, mae, ccc)
bind_rows(lotsa_metrics(pred_lhr_baseline_test,
truth = latent_hr_mean_lead1,
estimate = Estimate) %>%
mutate(Model = "Baseline"),
lotsa_metrics(pred_lhr_v2csreprss_test,
truth = latent_hr_mean_lead1,
estimate = Estimate) %>%
mutate(Model = "Added predictor")) %>%
select(Model, .metric, .estimate) %>%
pivot_wider(names_from = ".metric", values_from = ".estimate")
Model
|
rmse
|
rsq
|
mae
|
ccc
|
Baseline
|
0.1862156
|
0.9822788
|
0.1080593
|
0.9887926
|
Added predictor
|
0.1857038
|
0.9823449
|
0.1077390
|
0.9888036
|
Most improved countries
most_improved_lhr <- bind_cols(
pred_lhr_baseline_test %>%
mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>%
select(country, gwcode, year, latent_hr_mean_lead1,
estimate_baseline = Estimate, error_baseline),
pred_lhr_v2csreprss_test %>%
mutate(error_v2csreprss = Estimate - latent_hr_mean_lead1) %>%
select(estimate_v2csreprss = Estimate, error_v2csreprss)
) %>%
# mutate(across(starts_with("error"), ~round(., 4))) %>%
mutate(improved = error_v2csreprss < 0 & abs(error_v2csreprss) < abs(error_baseline)) %>%
mutate(diff = error_v2csreprss - error_baseline) %>%
mutate(abs_diff = abs(diff)) %>%
filter(improved) %>%
arrange(desc(abs_diff))
nrow(most_improved_lhr)
## [1] 127
most_improved_lhr %>%
select(Country = country, Year = year,
`Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
`Predicted (baseline)` = estimate_baseline,
`Predicted (CS repression)` = estimate_v2csreprss,
`|CS repression − baseline|` = abs_diff) %>%
head(5) %>%
kbl(digits = 3, align = "lccccc") %>%
kable_styling()
Country
|
Year
|
Actual latent human rights value (t + 1)
|
Predicted (baseline)
|
Predicted (CS repression)
|
|CS repression − baseline|
|
Libya
|
2011
|
-1.395
|
-1.818
|
-1.676
|
0.142
|
Fiji
|
2013
|
1.672
|
1.377
|
1.428
|
0.051
|
Mauritania
|
2013
|
0.558
|
0.595
|
0.555
|
0.040
|
Colombia
|
2011
|
-1.013
|
-1.090
|
-1.051
|
0.039
|
Slovakia
|
2011
|
2.019
|
1.908
|
1.943
|
0.035
|
---
title: "Predictions"
author: "Suparna Chaudhry and Andrew Heiss"
date: "Last run: `r format(Sys.time(), '%F')`"
output: 
  html_document:
    code_folding: hide
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
library(knitr)
library(kableExtra)
knit_print.data.frame <- function(x, ...) {
  res <- paste(c('', '', kable_styling(kable(x, booktabs = TRUE))), collapse = '\n')
  asis_output(res)
}

registerS3method("knit_print", "data.frame", knit_print.data.frame)
registerS3method("knit_print", "grouped_df", knit_print.data.frame)

knitr::opts_chunk$set(fig.retina = 3,
                      tidy.opts = list(width.cutoff = 120),  # For code
                      options(width = 90),  # For output
                      fig.asp = 0.618, fig.width = 7, 
                      fig.align = "center", out.width = "85%")

options(dplyr.summarise.inform = FALSE,
        knitr.kable.NA = "")
```

```{r load-libraries-data, message=FALSE, warning=FALSE}
library(tidyverse)
library(targets)
library(yardstick)
library(broom)
library(patchwork)
library(scales)
library(here)

# Load data
# Need to use this withr thing because tar_read() and tar_load() need to see the
# _targets folder in the current directory, but this .Rmd file is in a subfolder
withr::with_dir(here::here(), {
  source(tar_read(plot_funs))
  
  canary_testing_lagged <- tar_read(panel_testing_lagged)
  
  # Load big list of models
  model_df <- tar_read(model_df) %>% 
    filter(str_detect(model, "baseline") | str_detect(model, "v2csreprss"))
  
  # Load actual model objects
  tar_load(c(m_pts_baseline_train, m_pts_v2csreprss_train,
             m_pts_total_train, m_pts_advocacy_train, 
             m_pts_entry_train, m_pts_funding_train,
             m_lhr_baseline_train, m_lhr_v2csreprss_train,
             m_lhr_total_train, m_lhr_advocacy_train, 
             m_lhr_entry_train, m_lhr_funding_train))
})

pts_levels <- levels(canary_testing_lagged$PTS_factor)

# Returns a data frame of predicted probabilities with actual = 1 
# when the response outcome happens
match_actual <- function(x, pred, actual) {
  pred %>% 
    select(fitted = {{x}}) %>% 
    mutate(actual = as.numeric(actual == colnames(pred[x])),
           plot_level = colnames(pred[x]))
}
```

# E~1a~: NGO laws and political terror

```{r e1a-predictions, warning=FALSE}
seed_pts_prediction <- 5494  # From random.org

set.seed(seed_pts_prediction)
pred_pts_baseline <- predict(
  m_pts_baseline_train, 
  newdata = select(canary_testing_lagged, -PTS_factor_lead1),
  allow_new_levels = TRUE
) %>% 
  as_tibble() %>% 
  magrittr::set_colnames(pts_levels)

set.seed(seed_pts_prediction)
pred_pts_total <- predict(
  m_pts_total_train, 
  newdata = select(canary_testing_lagged, -PTS_factor_lead1),
  allow_new_levels = TRUE
) %>% 
  as_tibble() %>% 
  magrittr::set_colnames(pts_levels)

set.seed(seed_pts_prediction)
pred_pts_advocacy <- predict(
  m_pts_advocacy_train, 
  newdata = select(canary_testing_lagged, -PTS_factor_lead1),
  allow_new_levels = TRUE
) %>% 
  as_tibble() %>% 
  magrittr::set_colnames(pts_levels)

set.seed(seed_pts_prediction)
pred_pts_entry <- predict(
  m_pts_entry_train, 
  newdata = select(canary_testing_lagged, -PTS_factor_lead1),
  allow_new_levels = TRUE
) %>% 
  as_tibble() %>% 
  magrittr::set_colnames(pts_levels)

set.seed(seed_pts_prediction)
pred_pts_funding <- predict(
  m_pts_funding_train, 
  newdata = select(canary_testing_lagged, -PTS_factor_lead1),
  allow_new_levels = TRUE
) %>% 
  as_tibble() %>% 
  magrittr::set_colnames(pts_levels)
```

## Separation plots

### Baseline models

```{r e1a-separation-baseline}
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_baseline), 
                         FUN = match_actual, 
                         pred = pred_pts_baseline, 
                         actual = as.character(canary_testing_lagged$PTS_factor_lead1))

plot_data <- pred_long_list %>% 
  bind_rows() %>% 
  filter(!is.nan(fitted), !is.na(actual)) %>% 
  group_by(plot_level) %>% 
  arrange(plot_level, fitted) %>% 
  mutate(index = row_number()) %>% 
  ungroup()

markers <- plot_data %>% 
  group_by(plot_level) %>%
  summarize(obs = n(), expected = sum(fitted),
            marker = obs - expected + 1)

n_in_group <- nrow(pred_pts_baseline)

plot_data_condensed <- plot_data %>% 
  # rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
  mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
  mutate(rleid = cumsum(rleid)) %>% 
  group_by(plot_level, rleid) %>% 
  slice(1) %>% 
  group_by(plot_level) %>% 
  mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>% 
  ungroup()

ggplot(plot_data_condensed) +
  geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) + 
  geom_line(aes(x = index, y = fitted), size = 0.3) +
  geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) + 
  scale_fill_manual(values = c("grey90", "#2ECC40")) +
  guides(fill = "none") +
  facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
  theme_void(base_family = "Inter Bold")
```

### Models with NGO laws

#### Total legal barriers

```{r e1a-separation-total}
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_total), 
                         FUN = match_actual, 
                         pred = pred_pts_total, 
                         actual = as.character(canary_testing_lagged$PTS_factor_lead1))

plot_data <- pred_long_list %>% 
  bind_rows() %>% 
  filter(!is.nan(fitted), !is.na(actual)) %>% 
  group_by(plot_level) %>% 
  arrange(plot_level, fitted) %>% 
  mutate(index = row_number()) %>% 
  ungroup()

markers <- plot_data %>% 
  group_by(plot_level) %>%
  summarize(obs = n(), expected = sum(fitted),
            marker = obs - expected + 1)

n_in_group <- nrow(pred_pts_total)

plot_data_condensed <- plot_data %>% 
  # rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
  mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
  mutate(rleid = cumsum(rleid)) %>% 
  group_by(plot_level, rleid) %>% 
  slice(1) %>% 
  group_by(plot_level) %>% 
  mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>% 
  ungroup()

ggplot(plot_data_condensed) +
  geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) + 
  geom_line(aes(x = index, y = fitted), size = 0.3) +
  geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) + 
  scale_fill_manual(values = c("grey90", "#2ECC40")) +
  guides(fill = "none") +
  facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
  theme_void(base_family = "Inter Bold")
```

#### Barriers to advocacy

```{r e1a-separation-advocacy}
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_advocacy), 
                         FUN = match_actual, 
                         pred = pred_pts_advocacy, 
                         actual = as.character(canary_testing_lagged$PTS_factor_lead1))

plot_data <- pred_long_list %>% 
  bind_rows() %>% 
  filter(!is.nan(fitted), !is.na(actual)) %>% 
  group_by(plot_level) %>% 
  arrange(plot_level, fitted) %>% 
  mutate(index = row_number()) %>% 
  ungroup()

markers <- plot_data %>% 
  group_by(plot_level) %>%
  summarize(obs = n(), expected = sum(fitted),
            marker = obs - expected + 1)

n_in_group <- nrow(pred_pts_advocacy)

plot_data_condensed <- plot_data %>% 
  # rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
  mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
  mutate(rleid = cumsum(rleid)) %>% 
  group_by(plot_level, rleid) %>% 
  slice(1) %>% 
  group_by(plot_level) %>% 
  mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>% 
  ungroup()

ggplot(plot_data_condensed) +
  geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) + 
  geom_line(aes(x = index, y = fitted), size = 0.3) +
  geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) + 
  scale_fill_manual(values = c("grey90", "#2ECC40")) +
  guides(fill = "none") +
  facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
  theme_void(base_family = "Inter Bold")
```

#### Barriers to entry

```{r e1a-separation-entry}
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_entry), 
                         FUN = match_actual, 
                         pred = pred_pts_entry, 
                         actual = as.character(canary_testing_lagged$PTS_factor_lead1))

plot_data <- pred_long_list %>% 
  bind_rows() %>% 
  filter(!is.nan(fitted), !is.na(actual)) %>% 
  group_by(plot_level) %>% 
  arrange(plot_level, fitted) %>% 
  mutate(index = row_number()) %>% 
  ungroup()

markers <- plot_data %>% 
  group_by(plot_level) %>%
  summarize(obs = n(), expected = sum(fitted),
            marker = obs - expected + 1)

n_in_group <- nrow(pred_pts_entry)

plot_data_condensed <- plot_data %>% 
  # rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
  mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
  mutate(rleid = cumsum(rleid)) %>% 
  group_by(plot_level, rleid) %>% 
  slice(1) %>% 
  group_by(plot_level) %>% 
  mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>% 
  ungroup()

ggplot(plot_data_condensed) +
  geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) + 
  geom_line(aes(x = index, y = fitted), size = 0.3) +
  geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) + 
  scale_fill_manual(values = c("grey90", "#2ECC40")) +
  guides(fill = "none") +
  facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
  theme_void(base_family = "Inter Bold")
```

#### Barriers to funding

```{r e1a-separation-funding}
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_funding), 
                         FUN = match_actual, 
                         pred = pred_pts_funding, 
                         actual = as.character(canary_testing_lagged$PTS_factor_lead1))

plot_data <- pred_long_list %>% 
  bind_rows() %>% 
  filter(!is.nan(fitted), !is.na(actual)) %>% 
  group_by(plot_level) %>% 
  arrange(plot_level, fitted) %>% 
  mutate(index = row_number()) %>% 
  ungroup()

markers <- plot_data %>% 
  group_by(plot_level) %>%
  summarize(obs = n(), expected = sum(fitted),
            marker = obs - expected + 1)

n_in_group <- nrow(pred_pts_funding)

plot_data_condensed <- plot_data %>% 
  # rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
  mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
  mutate(rleid = cumsum(rleid)) %>% 
  group_by(plot_level, rleid) %>% 
  slice(1) %>% 
  group_by(plot_level) %>% 
  mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>% 
  ungroup()

ggplot(plot_data_condensed) +
  geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) + 
  geom_line(aes(x = index, y = fitted), size = 0.3) +
  geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) + 
  scale_fill_manual(values = c("grey90", "#2ECC40")) +
  guides(fill = "none") +
  facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
  theme_void(base_family = "Inter Bold")
```

## Prediction diagnostics

```{r e1a-improved-cases}
pred_pts_baseline_categories <- pred_pts_baseline %>% 
  rownames_to_column() %>% 
  pivot_longer(-rowname) %>% 
  group_by(rowname) %>% 
  filter(rank(-value, ties.method = "last") == 1) %>% 
  ungroup() %>% 
  select(predicted = name)

pred_pts_total_categories <- pred_pts_total %>% 
  rownames_to_column() %>% 
  pivot_longer(-rowname) %>% 
  group_by(rowname) %>% 
  filter(rank(-value, ties.method = "last") == 1) %>% 
  ungroup() %>% 
  select(predicted = name)

pred_pts_advocacy_categories <- pred_pts_advocacy %>% 
  rownames_to_column() %>% 
  pivot_longer(-rowname) %>% 
  group_by(rowname) %>% 
  filter(rank(-value, ties.method = "last") == 1) %>% 
  ungroup() %>% 
  select(predicted = name)

pred_pts_entry_categories <- pred_pts_entry %>% 
  rownames_to_column() %>% 
  pivot_longer(-rowname) %>% 
  group_by(rowname) %>% 
  filter(rank(-value, ties.method = "last") == 1) %>% 
  ungroup() %>% 
  select(predicted = name)

pred_pts_funding_categories <- pred_pts_funding %>% 
  rownames_to_column() %>% 
  pivot_longer(-rowname) %>% 
  group_by(rowname) %>% 
  filter(rank(-value, ties.method = "last") == 1) %>% 
  ungroup() %>% 
  select(predicted = name)


pts_cat_preds <- canary_testing_lagged %>% 
  select(country, year, actual = PTS_factor_lead1) %>% 
  mutate(pred_baseline = pred_pts_baseline_categories$predicted,
         pred_total = pred_pts_total_categories$predicted,
         pred_advocacy = pred_pts_advocacy_categories$predicted,
         pred_entry = pred_pts_entry_categories$predicted,
         pred_funding = pred_pts_funding_categories$predicted) %>% 
  filter(!is.na(actual)) %>%
  mutate(is_correct_baseline = actual == pred_baseline,
         is_correct_total = actual == pred_total,
         is_correct_advocacy = actual == pred_advocacy,
         is_correct_entry = actual == pred_entry,
         is_correct_funding = actual == pred_funding)

# Improved predictions
pts_improved_total <- pts_cat_preds %>% 
  filter(!is_correct_baseline & is_correct_total) %>% 
  select(Country = country, Year = year, Actual = actual,
         `CS repression included` = pred_total,
         `Baseline` = pred_baseline)

pts_improved_advocacy <- pts_cat_preds %>% 
  filter(!is_correct_baseline & is_correct_advocacy) %>% 
  select(Country = country, Year = year, Actual = actual,
         `CS repression included` = pred_advocacy,
         `Baseline` = pred_baseline)

pts_improved_entry <- pts_cat_preds %>% 
  filter(!is_correct_baseline & is_correct_entry) %>% 
  select(Country = country, Year = year, Actual = actual,
         `CS repression included` = pred_entry,
         `Baseline` = pred_baseline)

pts_improved_funding <- pts_cat_preds %>% 
  filter(!is_correct_baseline & is_correct_funding) %>% 
  select(Country = country, Year = year, Actual = actual,
         `CS repression included` = pred_funding,
         `Baseline` = pred_baseline)

pts_improved_total %>% 
  kbl(caption = "Cases improved with total legal barriers") %>% 
  kable_styling()

pts_improved_advocacy %>% 
  kbl(caption = "Cases improved with barriers to advocacy") %>% 
  kable_styling()

pts_improved_entry %>% 
  kbl(caption = "Cases improved with barriers to entry") %>% 
  kable_styling()

pts_improved_funding %>% 
  kbl(caption = "Cases improved with barriers to funding") %>% 
  kable_styling()
```

```{r e1a-correct-not-correct}
correct_lookup <- tribble(
  ~name, ~nice_name,
  "is_correct_total", "Total legal barriers included",
  "is_correct_advocacy", "Barriers to advocacy included",
  "is_correct_entry", "Barriers to entry included",
  "is_correct_funding", "Barriers to funding included",
  "is_correct_baseline", "Baseline"
) %>% mutate(nice_name = fct_inorder(nice_name, ordered = TRUE))

pts_pred_table <- pts_cat_preds %>% 
  pivot_longer(starts_with("is_correct")) %>% 
  group_by(name, value) %>% 
  summarize(n = n()) %>% 
  ungroup() %>% 
  left_join(correct_lookup, by = "name") %>% 
  mutate(Prediction = ifelse(value == TRUE, "Correct", "Wrong")) %>% 
  arrange(nice_name) %>% 
  select(-name, -value) %>% 
  pivot_wider(names_from = "nice_name", values_from = "n")

pts_pred_table %>% 
  kbl(align = "lccccc") %>% 
  kable_styling()
```

\ 

# E~1b~: NGO laws and latent human rights

```{r e1d-predictions, warning=FALSE}
seed_lhr_prediction <- 3207  # From random.org

set.seed(seed_lhr_prediction)
pred_lhr_baseline <- predict(m_lhr_baseline_train, 
                             newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
                             allow_new_levels = TRUE)

set.seed(seed_lhr_prediction)
pred_lhr_total <- predict(m_lhr_total_train, 
                          newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
                          allow_new_levels = TRUE)

set.seed(seed_lhr_prediction)
pred_lhr_advocacy <- predict(m_lhr_advocacy_train, 
                             newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
                             allow_new_levels = TRUE)

set.seed(seed_lhr_prediction)
pred_lhr_entry <- predict(m_lhr_entry_train, 
                          newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
                          allow_new_levels = TRUE)

set.seed(seed_lhr_prediction)
pred_lhr_funding <- predict(m_lhr_funding_train, 
                            newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
                            allow_new_levels = TRUE)

pred_lhr_baseline_test <- canary_testing_lagged %>% 
  select(country, gwcode, year, latent_hr_mean_lead1) %>% 
  bind_cols(as_tibble(pred_lhr_baseline)) %>% 
  filter(!is.na(latent_hr_mean_lead1))

pred_lhr_total_test <- canary_testing_lagged %>% 
  select(country, gwcode, year, latent_hr_mean_lead1) %>% 
  bind_cols(as_tibble(pred_lhr_total)) %>% 
  filter(!is.na(latent_hr_mean_lead1))

pred_lhr_advocacy_test <- canary_testing_lagged %>% 
  select(country, gwcode, year, latent_hr_mean_lead1) %>% 
  bind_cols(as_tibble(pred_lhr_advocacy)) %>% 
  filter(!is.na(latent_hr_mean_lead1))

pred_lhr_entry_test <- canary_testing_lagged %>% 
  select(country, gwcode, year, latent_hr_mean_lead1) %>% 
  bind_cols(as_tibble(pred_lhr_entry)) %>% 
  filter(!is.na(latent_hr_mean_lead1))

pred_lhr_funding_test <- canary_testing_lagged %>% 
  select(country, gwcode, year, latent_hr_mean_lead1) %>% 
  bind_cols(as_tibble(pred_lhr_funding)) %>% 
  filter(!is.na(latent_hr_mean_lead1))
```

## Improvement from baseline models

```{r e1d-pred-plots, warning=FALSE}
plot_baseline <- ggplot(pred_lhr_baseline_test, 
                        aes(x = latent_hr_mean_lead1, y = Estimate)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Baseline",
       x = "Actual latent human rights value (t + 1)",
       y = "Predicted latent human rights value (t + 1)") +
  coord_equal() +
  theme_ngo()

plot_total <- ggplot(pred_lhr_total_test, 
                     aes(x = latent_hr_mean_lead1, y = Estimate)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Total legal barriers",
       x = "Actual latent human rights value (t + 1)",
       y = "Predicted latent human rights value (t + 1)") +
  coord_equal() +
  theme_ngo()

plot_advocacy <- ggplot(pred_lhr_advocacy_test, 
                        aes(x = latent_hr_mean_lead1, y = Estimate)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Barriers to advocacy",
       x = "Actual latent human rights value (t + 1)",
       y = "Predicted latent human rights value (t + 1)") +
  coord_equal() +
  theme_ngo()

plot_entry <- ggplot(pred_lhr_entry_test, 
                     aes(x = latent_hr_mean_lead1, y = Estimate)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Barriers to entry",
       x = "Actual latent human rights value (t + 1)",
       y = "Predicted latent human rights value (t + 1)") +
  coord_equal() +
  theme_ngo()

plot_funding <- ggplot(pred_lhr_funding_test, 
                       aes(x = latent_hr_mean_lead1, y = Estimate)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Barriers to funding",
       x = "Actual latent human rights value (t + 1)",
       y = "Predicted latent human rights value (t + 1)") +
  coord_equal() +
  theme_ngo()

(plot_baseline | plot_total) / (plot_advocacy | plot_entry | plot_funding)
```

```{r e1d-metrics}
lotsa_metrics <- metric_set(rmse, rsq, mae, ccc)

bind_rows(lotsa_metrics(pred_lhr_baseline_test, 
                        truth = latent_hr_mean_lead1, 
                        estimate = Estimate) %>% 
            mutate(Model = "Baseline"),
          lotsa_metrics(pred_lhr_total_test, 
                        truth = latent_hr_mean_lead1, 
                        estimate = Estimate) %>% 
            mutate(Model = "Total legal barriers"),
          lotsa_metrics(pred_lhr_advocacy_test, 
                        truth = latent_hr_mean_lead1, 
                        estimate = Estimate) %>% 
            mutate(Model = "Barriers to advocacy"),
          lotsa_metrics(pred_lhr_entry_test, 
                        truth = latent_hr_mean_lead1, 
                        estimate = Estimate) %>% 
            mutate(Model = "Barriers to entry"),
          lotsa_metrics(pred_lhr_funding_test, 
                        truth = latent_hr_mean_lead1, 
                        estimate = Estimate) %>% 
            mutate(Model = "Barriers to funding")) %>% 
  select(Model, .metric, .estimate) %>% 
  pivot_wider(names_from = ".metric", values_from = ".estimate")
```

## Most improved countries

### Total legal barriers

```{r e1d-most-improved-total}
most_improved_lhr_total <- bind_cols(
  pred_lhr_baseline_test %>% 
    mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>% 
    select(country, gwcode, year, latent_hr_mean_lead1, 
           estimate_baseline = Estimate, error_baseline),
  pred_lhr_total_test %>% 
    mutate(error_total = Estimate - latent_hr_mean_lead1) %>% 
    select(estimate_total = Estimate, error_total)
) %>% 
  mutate(improved = error_total < 0 & abs(error_total) < abs(error_baseline)) %>% 
  mutate(diff = error_total - error_baseline) %>% 
  mutate(abs_diff = abs(diff)) %>% 
  filter(improved) %>% 
  arrange(desc(abs_diff))

nrow(most_improved_lhr_total)

most_improved_lhr_total %>% 
  select(Country = country, Year = year, 
         `Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
         `Predicted (baseline)` = estimate_baseline,
         `Predicted (CS repression)` = estimate_total,
         `|CS repression − baseline|` = abs_diff) %>% 
  head(5) %>% 
  kbl(digits = 3, align = "lccccc") %>% 
  kable_styling()
```

### Barriers to advocacy

```{r e1d-most-improved-advocacy}
most_improved_lhr_advocacy <- bind_cols(
  pred_lhr_baseline_test %>% 
    mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>% 
    select(country, gwcode, year, latent_hr_mean_lead1, 
           estimate_baseline = Estimate, error_baseline),
  pred_lhr_advocacy_test %>% 
    mutate(error_advocacy = Estimate - latent_hr_mean_lead1) %>% 
    select(estimate_advocacy = Estimate, error_advocacy)
) %>% 
  mutate(improved = error_advocacy < 0 & abs(error_advocacy) < abs(error_baseline)) %>% 
  mutate(diff = error_advocacy - error_baseline) %>% 
  mutate(abs_diff = abs(diff)) %>% 
  filter(improved) %>% 
  arrange(desc(abs_diff))

nrow(most_improved_lhr_advocacy)

most_improved_lhr_advocacy %>% 
  select(Country = country, Year = year, 
         `Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
         `Predicted (baseline)` = estimate_baseline,
         `Predicted (CS repression)` = estimate_advocacy,
         `|CS repression − baseline|` = abs_diff) %>% 
  head(5) %>% 
  kbl(digits = 3, align = "lccccc") %>% 
  kable_styling()
```

### Barriers to entry

```{r e1d-most-improved-entry}
most_improved_lhr_entry <- bind_cols(
  pred_lhr_baseline_test %>% 
    mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>% 
    select(country, gwcode, year, latent_hr_mean_lead1, 
           estimate_baseline = Estimate, error_baseline),
  pred_lhr_entry_test %>% 
    mutate(error_entry = Estimate - latent_hr_mean_lead1) %>% 
    select(estimate_entry = Estimate, error_entry)
) %>% 
  mutate(improved = error_entry < 0 & abs(error_entry) < abs(error_baseline)) %>% 
  mutate(diff = error_entry - error_baseline) %>% 
  mutate(abs_diff = abs(diff)) %>% 
  filter(improved) %>% 
  arrange(desc(abs_diff))

nrow(most_improved_lhr_entry)

most_improved_lhr_entry %>% 
  select(Country = country, Year = year, 
         `Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
         `Predicted (baseline)` = estimate_baseline,
         `Predicted (CS repression)` = estimate_entry,
         `|CS repression − baseline|` = abs_diff) %>% 
  head(5) %>% 
  kbl(digits = 3, align = "lccccc") %>% 
  kable_styling()
```

### Barriers to funding

```{r e1d-most-improved-funding}
most_improved_lhr_funding <- bind_cols(
  pred_lhr_baseline_test %>% 
    mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>% 
    select(country, gwcode, year, latent_hr_mean_lead1, 
           estimate_baseline = Estimate, error_baseline),
  pred_lhr_funding_test %>% 
    mutate(error_funding = Estimate - latent_hr_mean_lead1) %>% 
    select(estimate_funding = Estimate, error_funding)
) %>% 
  mutate(improved = error_funding < 0 & abs(error_funding) < abs(error_baseline)) %>% 
  mutate(diff = error_funding - error_baseline) %>% 
  mutate(abs_diff = abs(diff)) %>% 
  filter(improved) %>% 
  arrange(desc(abs_diff))

nrow(most_improved_lhr_funding)

most_improved_lhr_funding %>% 
  select(Country = country, Year = year, 
         `Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
         `Predicted (baseline)` = estimate_baseline,
         `Predicted (CS repression)` = estimate_funding,
         `|CS repression − baseline|` = abs_diff) %>% 
  head(5) %>% 
  kbl(digits = 3, align = "lccccc") %>% 
  kable_styling()
```

\ 

---

\ 

# E~2a~: Civil society environment and political terror

```{r e2a-predictions, warning=FALSE}
seed_pts_prediction <- 5494  # From random.org

set.seed(seed_pts_prediction)
pred_pts_baseline <- predict(
  m_pts_baseline_train, 
  newdata = select(canary_testing_lagged, -PTS_factor_lead1),
  allow_new_levels = TRUE
) %>% 
  as_tibble() %>% 
  magrittr::set_colnames(pts_levels)

set.seed(seed_pts_prediction)
pred_pts_v2csreprss <- predict(
  m_pts_v2csreprss_train, 
  newdata = select(canary_testing_lagged, -PTS_factor_lead1),
  allow_new_levels = TRUE
) %>% 
  as_tibble() %>% 
  magrittr::set_colnames(pts_levels)
```


## Separation plots

### Baseline model

```{r e2a-separation-baseline}
# Match each column in pred_mat (predicted probabilities for each response level)
pred_long_list <- lapply(1:ncol(pred_pts_baseline), 
                         FUN = match_actual, 
                         pred = pred_pts_baseline, 
                         actual = as.character(canary_testing_lagged$PTS_factor_lead1))

plot_data <- pred_long_list %>% 
  bind_rows() %>% 
  filter(!is.nan(fitted), !is.na(actual)) %>% 
  group_by(plot_level) %>% 
  arrange(plot_level, fitted) %>% 
  mutate(index = row_number()) %>% 
  ungroup()

markers <- plot_data %>% 
  group_by(plot_level) %>%
  summarize(obs = n(), expected = sum(fitted),
            marker = obs - expected + 1)

n_in_group <- nrow(pred_pts_baseline)

plot_data_condensed <- plot_data %>% 
  # rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
  mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
  mutate(rleid = cumsum(rleid)) %>% 
  group_by(plot_level, rleid) %>% 
  slice(1) %>% 
  group_by(plot_level) %>% 
  mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>% 
  ungroup()

ggplot(plot_data_condensed) +
  geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) + 
  geom_line(aes(x = index, y = fitted), size = 0.3) +
  geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) + 
  scale_fill_manual(values = c("grey90", "#2ECC40")) +
  guides(fill = "none") +
  facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
  theme_void(base_family = "Inter Bold")
```

### Model with civil society repression

```{r e2a-separation-cs}
pred_long_list <- lapply(1:ncol(pred_pts_v2csreprss), 
                         FUN = match_actual, 
                         pred = pred_pts_v2csreprss, 
                         actual = as.character(canary_testing_lagged$PTS_factor_lead1))

plot_data <- pred_long_list %>% 
  bind_rows() %>% 
  filter(!is.nan(fitted), !is.na(actual)) %>% 
  group_by(plot_level) %>% 
  arrange(plot_level, fitted) %>% 
  mutate(index = row_number()) %>% 
  ungroup()

markers <- plot_data %>% 
  group_by(plot_level) %>%
  summarize(obs = n(), expected = sum(fitted),
            marker = obs - expected + 1)

n_in_group <- nrow(pred_pts_v2csreprss)

plot_data_condensed <- plot_data %>% 
  # rle(), dplyr-style: https://stackoverflow.com/a/33510765/120898
  mutate(rleid = (actual != lag(actual, 1, default = 0))) %>%
  mutate(rleid = cumsum(rleid)) %>% 
  group_by(plot_level, rleid) %>% 
  slice(1) %>% 
  group_by(plot_level) %>% 
  mutate(xmin = index, xmax = lead(index, default = n_in_group)) %>% 
  ungroup()

ggplot(plot_data_condensed) +
  geom_rect(aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = as.factor(actual))) + 
  geom_line(aes(x = index, y = fitted), size = 0.3) +
  geom_point(data = markers, aes(x = marker, y = -0.05), shape = 17, size = 3) + 
  scale_fill_manual(values = c("grey90", "#2ECC40")) +
  guides(fill = "none") +
  facet_wrap(vars(plot_level), ncol = 1, scales = "free_x") +
  theme_void(base_family = "Inter Bold")
```

## Prediction diagnostics

```{r e2a-improved-cases}
pred_pts_baseline_categories <- pred_pts_baseline %>% 
  rownames_to_column() %>% 
  pivot_longer(-rowname) %>% 
  group_by(rowname) %>% 
  filter(rank(-value, ties.method = "last") == 1) %>% 
  ungroup() %>% 
  select(predicted = name)

pred_pts_v2csreprss_categories <- pred_pts_v2csreprss %>% 
  rownames_to_column() %>% 
  pivot_longer(-rowname) %>% 
  group_by(rowname) %>% 
  filter(rank(-value, ties.method = "last") == 1) %>% 
  ungroup() %>% 
  select(predicted = name)

pts_cat_preds <- canary_testing_lagged %>% 
  select(country, year, actual = PTS_factor_lead1) %>% 
  mutate(pred_baseline = pred_pts_baseline_categories$predicted,
         pred_v2csreprss = pred_pts_v2csreprss_categories$predicted) %>% 
  filter(!is.na(actual)) %>%
  mutate(is_correct_baseline = actual == pred_baseline,
         is_correct_v2csreprss = actual == pred_v2csreprss)

# Improved predictions
pts_improved <- pts_cat_preds %>% 
  filter(!is_correct_baseline & is_correct_v2csreprss) %>% 
  select(Country = country, Year = year, Actual = actual,
         `CS repression included` = pred_v2csreprss,
         `Baseline` = pred_baseline)

pts_improved %>% 
  kbl() %>% 
  kable_styling()
```

```{r e2a-correct-not-correct}
correct_lookup <- tribble(
  ~name, ~nice_name,
  "is_correct_v2csreprss", "CS repression included",
  "is_correct_baseline", "Baseline"
) %>% mutate(nice_name = fct_inorder(nice_name, ordered = TRUE))

pts_pred_table <- pts_cat_preds %>% 
  pivot_longer(starts_with("is_correct")) %>% 
  group_by(name, value) %>% 
  summarize(n = n()) %>% 
  ungroup() %>% 
  left_join(correct_lookup, by = "name") %>% 
  mutate(Prediction = ifelse(value == TRUE, "Correct", "Wrong")) %>% 
  arrange(nice_name) %>% 
  select(-name, -value) %>% 
  pivot_wider(names_from = "nice_name", values_from = "n")

pts_pred_table %>% 
  kbl(align = "lcc") %>% 
  kable_styling()
```

\ 

# E~2b~: Civil society environment and latent human rights

```{r e2b-predictions, warning=FALSE}
seed_lhr_prediction <- 3207  # From random.org

set.seed(seed_lhr_prediction)
pred_lhr_baseline <- predict(m_lhr_baseline_train, 
                                newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
                                allow_new_levels = TRUE)

set.seed(seed_lhr_prediction)
pred_lhr_v2csreprss <- predict(m_lhr_v2csreprss_train, 
                                  newdata = select(canary_testing_lagged, -latent_hr_mean_lead1),
                                  allow_new_levels = TRUE)

pred_lhr_baseline_test <- canary_testing_lagged %>% 
  select(country, gwcode, year, latent_hr_mean_lead1) %>% 
  bind_cols(as_tibble(pred_lhr_baseline)) %>% 
  filter(!is.na(latent_hr_mean_lead1))

pred_lhr_v2csreprss_test <- canary_testing_lagged %>% 
  select(country, gwcode, year, latent_hr_mean_lead1) %>% 
  bind_cols(as_tibble(pred_lhr_v2csreprss)) %>% 
  filter(!is.na(latent_hr_mean_lead1))
```

## Improvement from baseline models

```{r e2b-pred-plots, warning=FALSE}
plot_baseline <- ggplot(pred_lhr_baseline_test, 
                        aes(x = latent_hr_mean_lead1, y = Estimate)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Baseline",
       x = "Actual latent human rights value (t + 1)",
       y = "Predicted latent human rights value (t + 1)") +
  coord_equal() +
  theme_ngo()

plot_v2csreprss <- ggplot(pred_lhr_v2csreprss_test, 
                          aes(x = latent_hr_mean_lead1, y = Estimate)) +
  geom_abline(lty = 2) +
  geom_point(alpha = 0.5) +
  labs(title = "Added predictor",
       x = "Actual latent human rights value (t + 1)",
       y = "Predicted latent human rights value (t + 1)") +
  coord_equal() +
  theme_ngo()

plot_baseline | plot_v2csreprss
```

```{r e2b-metrics}
lotsa_metrics <- metric_set(rmse, rsq, mae, ccc)

bind_rows(lotsa_metrics(pred_lhr_baseline_test, 
                        truth = latent_hr_mean_lead1, 
                        estimate = Estimate) %>% 
            mutate(Model = "Baseline"),
          lotsa_metrics(pred_lhr_v2csreprss_test, 
                        truth = latent_hr_mean_lead1, 
                        estimate = Estimate) %>% 
            mutate(Model = "Added predictor")) %>% 
  select(Model, .metric, .estimate) %>% 
  pivot_wider(names_from = ".metric", values_from = ".estimate")
```

## Most improved countries

```{r e2b-most-improved}
most_improved_lhr <- bind_cols(
  pred_lhr_baseline_test %>% 
    mutate(error_baseline = Estimate - latent_hr_mean_lead1) %>% 
    select(country, gwcode, year, latent_hr_mean_lead1, 
           estimate_baseline = Estimate, error_baseline),
  pred_lhr_v2csreprss_test %>% 
    mutate(error_v2csreprss = Estimate - latent_hr_mean_lead1) %>% 
    select(estimate_v2csreprss = Estimate, error_v2csreprss)
) %>% 
  # mutate(across(starts_with("error"), ~round(., 4))) %>% 
  mutate(improved = error_v2csreprss < 0 & abs(error_v2csreprss) < abs(error_baseline)) %>% 
  mutate(diff = error_v2csreprss - error_baseline) %>% 
  mutate(abs_diff = abs(diff)) %>% 
  filter(improved) %>% 
  arrange(desc(abs_diff))

nrow(most_improved_lhr)

most_improved_lhr %>% 
  select(Country = country, Year = year, 
         `Actual latent human rights value (t + 1)` = latent_hr_mean_lead1,
         `Predicted (baseline)` = estimate_baseline,
         `Predicted (CS repression)` = estimate_v2csreprss,
         `|CS repression − baseline|` = abs_diff) %>% 
  head(5) %>% 
  kbl(digits = 3, align = "lccccc") %>% 
  kable_styling()
```
