Targets workflow
targets pipeline
We use the magical `targets` package to run our analysis and keep track of all dependencies automatically.
To build our entire project, run `targets::tar_make()` at the R console.
Here’s our complete pipeline:
Actual code
All the data processing is handled with dataset-specific functions that live in `R/funs_data-cleaning.R`, which `targets` then runs as needed. For the sake of transparency, here’s that code:
R/funs_data-cleaning.R
Code
library(readxl)
suppressPackageStartupMessages(library(lubridate))
library(countrycode)
# Read and clean the ICCPR derogation / WHO data stored in an Excel file.
#
# path: path to the Excel file to read.
#
# Returns a data frame with standardized country names, ISO3 codes, WHO region
# labels, and the reporting date renamed to `day`, followed by all remaining
# columns.
clean_iccpr_who <- function(path) {
  # Lookup table mapping WHO region abbreviations to their full names
  who_regions <- tribble(
    ~who_region, ~who_region_long,
    "AFRO", "Regional Office for Africa",
    "AMRO", "Regional Office for the Americas",
    "SEARO", "Regional Office for South-East Asia",
    "EURO", "Regional Office for Europe",
    "EMRO", "Regional Office for the Eastern Mediterranean",
    "WPRO", "Regional Office for the Western Pacific"
  )

  x <- read_excel(path) %>%
    janitor::clean_names() %>%
    # Make this a date instead of PosixCT
    mutate(date_reported = as.Date(date_reported)) %>%
    # All NAs here are actually 0s
    replace_na(list(iccpr_derogation_filed = 0,
                    derogation_start = 0,
                    derogation_ineffect = 0,
                    derogation_end = 0)) %>%
    # Country names and codes fun times: build both columns from the
    # two-letter code, with manual overrides for Kosovo and Türkiye
    mutate(
      country_name = countrycode(
        country_code,
        origin = "iso2c", destination = "country.name",
        custom_match = c("XK" = "Kosovo", "TR" = "Türkiye")
      ),
      iso3 = countrycode(
        country_code,
        origin = "iso2c", destination = "iso3c",
        custom_match = c("XK" = "XKX")
      )
    ) %>%
    left_join(who_regions, by = "who_region") %>%
    # Final column order
    select(-c(country_code, country, cow_code)) %>%
    select(country_name, iso3, who_region, who_region_long,
           day = date_reported, everything())

  return(x)
}
# Read and clean the Oxford workbook, which stores one index per Excel sheet in
# wide format (one column per day).
#
# path: path to the Excel workbook.
#
# Returns a data frame with one row per country-day and one column per index
# (named after the cleaned sheet names), preceded by `country_name`, `iso3`,
# and `day`.
clean_oxford <- function(path) {
  x <- tibble(
    # Get a list of all the sheets in the Excel file
    index_name = excel_sheets(path)
  ) %>%
    # Read each sheet
    mutate(data = map(index_name, ~read_excel(path, sheet = .x))) %>%
    # Standardize the index name based on the sheet name
    mutate(index_name = janitor::make_clean_names(index_name)) %>%
    # Make each data frame cell in the list column long and a little cleaner
    mutate(clean = map(data, ~{
      .x %>%
        pivot_longer(cols = -c(country_code, country_name),
                     names_to = "day", values_to = "value") %>%
        mutate(day = dmy(day))
    })) %>%
    # Get rid of the original wide data frame and unnest the long clean data
    select(-data) %>%
    unnest(clean) %>%
    # Make data wide so that there's a column for each index and row for each
    # country-day
    pivot_wider(names_from = "index_name", values_from = "value") %>%
    # Country names and codes fun times: the source uses "RKS" for Kosovo,
    # which is recoded to "XKX" before looking up country names
    mutate(iso3 = recode(country_code, "RKS" = "XKX")) %>%
    mutate(country_name = countrycode(
      iso3,
      origin = "iso3c", destination = "country.name",
      custom_match = c("XKX" = "Kosovo", "TUR" = "Türkiye")
    )) %>%
    # Get rid of countries with all missing data
    group_by(country_name) %>%
    filter(!all(is.na(stringency_index))) %>%
    ungroup() %>%
    # Final column order
    select(-country_code) %>%
    select(country_name, iso3, day, everything())

  return(x)
}
# Read and clean the PanDem data stored in an Excel file.
#
# path: path to the Excel file to read.
#
# Returns a data frame with standardized country names, ISO3 codes, `year`, a
# numeric `year_quarter` (year + quarter / 10), the `pandem` and `panback`
# columns, and five `pandem_*` columns converted to labelled ordered factors.
clean_pandem <- function(path) {
  pandem_raw <- read_excel(path)

  # Labels (names) for the codes (values) used in the type* columns
  pandem_levels <- c("None" = "0", "Minor" = "1",
                     "Moderate" = "2", "Major" = "3")

  pandem_clean <- pandem_raw %>%
    # Turn the quarter into a decimal so that year + quarter gives e.g. 2020.2
    mutate(quarter_numeric = parse_number(quarter) / 10) %>%
    mutate(year_quarter = year + quarter_numeric) %>%
    # Convert names to ISO3 and then back again to get standardized names
    mutate(iso3 = countrycode(country_name,
                              origin = "country.name",
                              destination = "iso3c"),
           country_name = countrycode(iso3, origin = "iso3c",
                                      destination = "country.name",
                                      custom_match = c("TUR" = "Türkiye"))) %>%
    select(country_name, iso3, year, year_quarter,
           pandem, panback,
           pandem_discrim = type1,
           pandem_ndrights = type2,
           pandem_abusive = type3,
           pandem_nolimit = type4,
           pandem_media = type7) %>%
    # Make these 0-3 columns factors
    mutate(across(starts_with("pandem_"),
                  ~factor(., levels = pandem_levels, ordered = TRUE))) %>%
    # Add labels
    mutate(across(starts_with("pandem_"),
                  ~fct_recode(., !!!pandem_levels))) %>%
    # Drop unused levels
    mutate(across(starts_with("pandem_"), ~fct_drop(.)))

  return(pandem_clean)
}
# Read and clean the V-Dem data stored in an .rds file.
#
# path: path to the .rds file to read.
#
# Returns a tibble limited to 2020 onward with standardized country names, the
# V-Dem country text ID renamed to `iso3`, `year`, and a subset of V-Dem
# indicators.
clean_vdem <- function(path) {
  vdem_raw <- read_rds(path) %>% as_tibble()

  vdem_clean <- vdem_raw %>%
    filter(year >= 2020) %>%
    # Several V-Dem IDs are overridden by hand via custom_match
    mutate(country_name = countrycode(
      country_text_id,
      origin = "iso3c", destination = "country.name",
      custom_match = c("XKX" = "Kosovo", "ZZB" = "Zanzibar",
                       "PSG" = "Palestine (Gaza)", "SML" = "Somaliland",
                       "TUR" = "Türkiye")
    )) %>%
    select(country_name, iso3 = country_text_id, year,

           # Civil society stuff
           v2csreprss,  # CSO repression
           v2xcs_ccsi,  # Core civil society index (entry/exit, repression, participatory env)

           # Human rights and politics
           # Political corruption index (less to more, 0-1) (public sector +
           # executive + legislative + judicial corruption)
           v2x_corr,
           v2x_rule,  # Rule of law index

           # Rights indexes
           v2x_civlib,  # Civil liberties index
           v2x_clphy,  # Physical violence index
           v2x_clpriv,  # Private civil liberties index
           v2x_clpol,  # Political civil liberties index

           # Democracy
           v2x_polyarchy, v2x_libdem, v2x_regime_amb
    )

  return(vdem_clean)
}
# Build an empty country-day panel covering every country that appears in all
# four cleaned datasets.
#
# iccpr_who, oxford, pandem, vdem: cleaned data frames, each with an `iso3`
#   column; `oxford` must also have a `day` column, which sets the last day of
#   the panel.
#
# Returns a data frame with one row per country per day from 2020-03-11 to the
# latest day in `oxford`, with columns `country_name`, `iso3`, `day`, `year`,
# and `year_quarter`.
create_daily_skeleton <- function(iccpr_who, oxford, pandem, vdem) {
  all_countries <- list(unique(iccpr_who$iso3),
                        unique(oxford$iso3),
                        unique(pandem$iso3),
                        unique(vdem$iso3))

  # Keep only the countries present in every dataset
  countries_in_all_data <- reduce(all_countries, intersect)

  # first_day <- min(oxford$day)
  first_day <- ymd("2020-03-11")
  last_day <- max(oxford$day)

  daily_skeleton <- expand_grid(
    iso3 = countries_in_all_data,
    day = seq(first_day, last_day, by = "1 day")
  ) %>%
    mutate(year = year(day),
           year_quarter = quarter(day, type = "year.quarter")) %>%
    # Pandem starts Q2 2020 on March 11 instead of April 1
    # NOTE(review): this is an exact floating-point comparison; it relies on
    # quarter() producing the same double as the literal 2020.1 -- confirm, or
    # consider dplyr::near()
    mutate(year_quarter = ifelse(year_quarter == 2020.1, 2020.2, year_quarter)) %>%
    mutate(country_name = countrycode(
      iso3,
      origin = "iso3c", destination = "country.name",
      custom_match = c("XKX" = "Kosovo", "TUR" = "Türkiye")
    )) %>%
    select(country_name, iso3, day, year, year_quarter)

  return(daily_skeleton)
}
# Join all the cleaned datasets onto the country-day skeleton.
#
# skeleton: country-day panel with `iso3`, `day`, `year`, and `year_quarter`.
# iccpr_who, oxford: daily data, joined by `iso3` and `day`.
# pandem: quarterly data, joined by `iso3` and `year_quarter`.
# vdem: yearly data, joined by `iso3` and `year`.
#
# Returns the skeleton with the columns from every dataset attached. These are
# left joins, so every skeleton row is kept.
make_final_data <- function(skeleton, iccpr_who, oxford, pandem, vdem) {
  # country_name (and pandem's year) are dropped before each join so the
  # skeleton's own copies are the only ones in the result
  daily_final <- skeleton %>%
    left_join(select(iccpr_who, -country_name), by = c("iso3", "day")) %>%
    left_join(select(oxford, -country_name), by = c("iso3", "day")) %>%
    left_join(select(pandem, -c(country_name, year)),
              by = c("iso3", "year_quarter")) %>%
    left_join(select(vdem, -country_name), by = c("iso3", "year"))

  return(daily_final)
}
# When using a file-based target, {targets} requires that the function that
# saves the file returns a path to the file. write_csv() and write_dta() both
# invisibly return the data frame being written, and saveRDS() returns NULL, so
# we need some wrapper functions to save the files and return the paths.
# Write `df` to a CSV file at `path` and return `path`, so the function can be
# used for a file-based target.
save_csv <- function(df, path) {
  readr::write_csv(df, path)
  return(path)
}
# Save `df` as an .rds file at `path` and return `path`, so the function can be
# used for a file-based target.
save_r <- function(df, path) {
  saveRDS(df, path)
  return(path)
}
# Write `df` to a Stata .dta file at `path` and return `path`, so the function
# can be used for a file-based target.
save_dta <- function(df, path) {
  haven::write_dta(df, path)
  return(path)
}