Code
library(tidyverse)
library(tidycensus)
library(sf)
library(lubridate)
# library(tsibble)
library(here)
General model:
long_term_loading ~ different_incentives_categorical + age + race + language + issuing_office + home_address_FIPS?
Outcomes:
load_after_six
: Binary indicator for whether they reloaded the card 6+ months after card is issuedtotal_amount_after_six
: Amount of money refilled 6+ months after card is issuedtotal_loadings_after_six
: Count of refills 6+ months after card is issuedTreatment:
incentive_cat
: Categorical variable showing the kind of incentive each person was given with the card, if any. Possible values are 0, 10, 15, 20, 30, 50, 70, Misc. Pass, Monthly Pass, Passport, and Subsidized Annual Pass; the values are ordered based on their intensityGeneral model:
reenrollment_in_orca ~ different_incentives_categorical + age + race + language + issuing_office + home_address_FIPS?
Outcomes:
ever_reenroll
: Binary indicator for whether the person ever reenrolled (every row for that person is all TRUE or all FALSE)reenrolled
: Binary indicator for whether current card issuing is a reenrollment (TRUE when the suffix for the card ID is greater than 1)Treatment:
General models:
long_term_loading ~ subsidized_passes + other_stuff
reenrollment_in_orca ~ subsidized_passes + other_stuff
Outcomes:
reenroll_after_study
: Binary indicator for whether the person reenrolled specifically after being assigned to one of the pseudo RCT treatment groups and receiving a passport previouslyTreatment:
treatment_passport_binary
: Binary indicator for whether the person was in Phase 1 or Phase 2 of the pseudo RCT passport programOther details related to ridership and purchasing, like:
total_amount_before_six
total_loadings_before_six
enrolled_previously
boardings_king_county_before_six
boardings_king_county_after_six
boardings_sound_transit_before_six
boardings_sound_transit_after_six
Details included in the rider registry:
Age
RaceDesc
LanguageSpoken
CardIssuingAgency
ACS 2019 block-group-level data for each rider’s home address:
bg_population
: Block-group populationbg_race
: Block-group denominator for racebg_race_white
: Count of white residentsbg_income
: Median income in block-groupTODO
Note: GEOID
in LIFT_registry.csv
is 12 digits, so it’s for the Census block group (2 + 3 + 6 + 1
, for state + county + tract + block group
)
lift_boardings1_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county",
"Question 2_ Fare Subsidies", "LIFT_boardings.csv"))
lift_boardings2_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county",
"Question 2_ Fare Subsidies",
"LIFT_boardings_2021-11-01_to_2022-03-06.csv"))
lift_sales_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county",
"Question 2_ Fare Subsidies", "LIFT_sales_2022-04-01.csv"))
lift_registry_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county",
"Question 2_ Fare Subsidies", "LIFT_registry_2022-04-01.csv"))
incentives <- c("0", "10", "15", "20", "30", "50", "70", "Misc. Pass",
"Monthly Pass", "Passport", "Subsidized Annual Pass")
incentives_labs <- c("$0", "$10", "$15", "$20", "$30", "$50", "$70", "Misc. Pass",
"Monthly Pass", "Passport", "Subsidized Annual Pass")
riders_clean <- lift_registry_raw %>%
# Split id column to get enrollment count
mutate(card_id_orig = card_id) %>%
separate(card_id, into = c("id", "times"), sep = "-") %>%
# Clean up column types
mutate(across(c(id, times), as.integer)) %>%
mutate(FIPS = as.character(FIPS)) %>%
# This date is a typo
mutate(Expiration = ifelse(Expiration == "00534984", NA, Expiration)) %>%
# Make the expiration date an actual date
mutate(Expiration = mdy(Expiration)) %>%
# Exclude rows where the expiration date is before the issued date
filter(Expiration > DateIssued) %>%
# Sometimes the same person gets enrolled twice in one day (like 2716-3)
# Only keep the last row of duplicate issue dates for individuals
group_by(id, DateIssued) %>%
slice_tail() %>%
# Renumber the times column
# group_by(id) %>%
# mutate(times = 1:n()) %>%
# Get rid of extra columns
ungroup() %>%
select(-duplicate) %>%
# Sort
arrange(id, times)
riders_clean
## # A tibble: 116,791 × 12
## Age RaceDesc LanguageSpoken Expiration DateIssued CardIssuingAgen… FIPS
## <dbl> <chr> <chr> <date> <date> <chr> <chr>
## 1 36 Black or African Ame… Other 2019-08-31 2017-06-15 KCMCCS 5303…
## 2 56 White English 2019-07-31 2017-06-05 YWCA 5303…
## 3 58 White English 2021-11-30 2019-08-28 King County Pub… 5303…
## 4 49 Black or African Ame… English 2023-11-30 2022-01-26 DSHS - ORCA LIF… 5305…
## 5 46 Asian English 2021-05-31 2019-06-30 King County Pub… 5303…
## 6 63 Asian Chinese 2019-03-31 2017-03-31 Promo Account 5303…
## 7 64 Asian Chinese 2019-06-30 2018-06-02 Promo Account 5303…
## 8 59 Asian Chinese 2019-03-31 2017-03-31 Promo Account 5303…
## 9 60 Asian Chinese 2019-06-30 2018-06-02 Promo Account 5303…
## 10 31 Black or African Ame… Somali 2019-05-31 2017-04-13 KCMCCS 5303…
## # … with 116,781 more rows, and 5 more variables: `Initial Load` <chr>,
## # `Study Card` <chr>, id <int>, times <int>, card_id_orig <chr>
sales <- lift_sales_raw %>%
group_by(week, card_id) %>%
summarize(total_amount = sum(Amount),
total_loadings = sum(loadings)) %>%
ungroup()# %>%
# as_tsibble(key = card_id, index = week)
sales
## # A tibble: 554,944 × 4
## week card_id total_amount total_loadings
## <date> <chr> <dbl> <dbl>
## 1 2017-03-05 11870-1 15 2
## 2 2017-03-05 12162-1 54 1
## 3 2017-03-05 13722-1 54 1
## 4 2017-03-05 19358-1 54 1
## 5 2017-03-05 24318-1 54 1
## 6 2017-03-05 33232-1 10 1
## 7 2017-03-05 64512-1 54 1
## 8 2017-03-05 84558-1 20 1
## 9 2017-03-05 93734-1 20 1
## 10 2017-03-05 93740-1 54 1
## # … with 554,934 more rows
# This takes a while
# sales_full_panel <- sales %>%
# fill_gaps(.full = TRUE) %>%
# replace_na(list(total_amount = 0, total_loadings = 0)) %>%
# mutate(card_id_orig = card_id) %>%
# separate(card_id, into = c("id", "times"), sep = "-") %>%
# as_tsibble(key = card_id_orig, index = week) %>%
# mutate(across(c(id, times), as.integer)) %>%
# mutate(year_week = yearweek(week, week_start = 1),
# year_week_nice = format(year_week, "%V/%Y"),
# year_month = yearmonth(week),
# year_month_nice = format(year_month, "%m/%Y"))
boardings <- lift_boardings1_raw %>%
bind_rows(lift_boardings2_raw) %>%
select(week, card_id,
boardings_king_county = `King County Metro`,
boardings_sound_transit = `Sound Transit`) %>%
mutate(card_id_orig = card_id) %>%
separate(card_id, into = c("id", "times"), sep = "-") %>%
mutate(across(c(id, times), as.integer)) %>%
mutate(across(starts_with("boardings"), ~replace_na(., 0)))# %>%
# as_tsibble(key = card_id_orig, index = week)
boardings
## # A tibble: 1,783,168 × 6
## week id times boardings_king_county boardings_sound_transit card_id_orig
## <date> <int> <int> <dbl> <dbl> <chr>
## 1 2017-03-12 94262 1 8 0 94262-1
## 2 2017-03-19 94262 1 4 0 94262-1
## 3 2017-03-26 94262 1 2 0 94262-1
## 4 2017-04-02 94262 1 4 0 94262-1
## 5 2017-04-16 94262 1 2 0 94262-1
## 6 2017-07-09 94262 1 2 0 94262-1
## 7 2017-07-16 94262 1 2 0 94262-1
## 8 2017-07-30 94262 1 3 0 94262-1
## 9 2017-08-06 94262 1 2 0 94262-1
## 10 2017-08-20 94262 1 3 0 94262-1
## # … with 1,783,158 more rows
# See a list of variable names
# Also available at https://api.census.gov/data/2020/acs/acs5/variables.html
# acs_possible_vars <- load_variables(2020, "acs5", cache = TRUE)
acs_vars <- tribble(
~name, ~var_name, ~description,
"B01003_001", "bg_population", "Total population",
"B02001_001", "bg_race_denom", "Race denominator",
"B02001_002", "bg_race_white", "Race: White alone",
"B19013_001", "bg_income", "Median household income",
"B01001_001", "bg_age_denom", "Age denominator",
"B01001_002", "bg_male", "Male",
"B15003_001", "bg_educ_denom", "Education denominator",
"B15003_017", "bg_hs", "High school",
"B15003_022", "bg_ba", "Bachelor's degree",
"B08134_061", "bg_pub_trans", "Minutes on public transportation",
"B08303_001", "bg_travel_time", "Total travel time",
"B25064_001", "bg_median_rent", "Median rent",
"B28002_001", "bg_internet_denom", "Internet denominator",
"B28002_013", "bg_no_internet", "No internet"
)
# Create a named vector to pass to get_acs()
vars_to_get <- acs_vars$name %>%
set_names(acs_vars$var_name)
# Get 2019 ACS data
# 2020 would be neat, but ≈20% of it is missing, ugh
# 2020 decennial would be neat too, but it's a huge mess
acs_raw <- get_acs(geography = "block group",
variables = vars_to_get,
state = 53, year = 2019, survey = "acs5")
# Make the data wide
acs_wa <- acs_raw %>%
select(-NAME, -moe) %>%
pivot_wider(names_from = "variable", values_from = "estimate") %>%
mutate(bg_pct_white = bg_race_white / bg_race_denom,
bg_pct_male = bg_male / bg_population,
bg_travel_time_per_capita = bg_travel_time / bg_population,
bg_pub_trans_per_capita = bg_pub_trans / bg_population,
bg_pct_hs = bg_hs / bg_educ_denom,
bg_pct_ba = bg_ba / bg_educ_denom,
bg_internet = 1 - (bg_no_internet / bg_internet_denom)) %>%
select(GEOID, bg_pct_white, bg_pct_male, bg_pct_hs, bg_pct_ba,
bg_internet, bg_travel_time_per_capita, bg_pub_trans_per_capita,
bg_income, bg_median_rent)
# Shapefiles for WA block groups
# Get block group boundaries
wa_bgs <- tigris::block_groups(state = 53, cb = FALSE, year = 2019)
# Make a smaller dataset of just IDs and enrollment dates and then make a column
# of enrollment dates + 180 days - we'll use this to process and collapse the
# sales and boarding data
rider_enrollment_dates <- riders_clean %>%
select(id, times, DateIssued) %>%
mutate(six_months_later = DateIssued + days(180))
rider_enrollment_dates
## # A tibble: 116,791 × 4
## id times DateIssued six_months_later
## <int> <int> <date> <date>
## 1 568 1 2017-06-15 2017-12-12
## 2 2690 1 2017-06-05 2017-12-02
## 3 2690 2 2019-08-28 2020-02-24
## 4 2704 1 2022-01-26 2022-07-25
## 5 2706 1 2019-06-30 2019-12-27
## 6 2708 1 2017-03-31 2017-09-27
## 7 2708 2 2018-06-02 2018-11-29
## 8 2710 1 2017-03-31 2017-09-27
## 9 2710 2 2018-06-02 2018-11-29
## 10 2716 1 2017-04-13 2017-10-10
## # … with 116,781 more rows
# Calculate how much money riders put on cards + frequency of refills in the 0-6
# and 6+ months after getting issued a card
sales_before_after_six_months <- sales %>%
# Split id column to get enrollment count
mutate(card_id_orig = card_id) %>%
separate(card_id, into = c("id", "times"), sep = "-") %>%
# Clean up column types
mutate(across(c(id, times), as.integer)) %>%
# Bring in rider dates
left_join(rider_enrollment_dates, by = c("id", "times")) %>%
# Get rid of rider/time combinations that don't exist
filter(!is.na(DateIssued)) %>%
# Create indicator for whether each sale is 6+ months after the initial care issue
mutate(after_six = (week - days(6)) > six_months_later,
after_six = ifelse(after_six, "after_six", "before_six")) %>%
# Get total of money loaded and frequency of loading before/after 6+ months
group_by(id, times, after_six) %>%
summarize(across(c(total_amount, total_loadings), ~sum(.))) %>%
ungroup() %>%
# Make wide
pivot_wider(names_from = after_six, values_from = c(total_amount, total_loadings)) %>%
mutate(load_after_six = total_amount_after_six > 0)
sales_before_after_six_months
## # A tibble: 61,047 × 7
## id times total_amount_after_six total_amount_befo… total_loadings_… total_loadings_…
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 568 1 30 NA 2 NA
## 2 2690 1 310 293. 15 10
## 3 2706 1 NA 82.5 NA 3
## 4 2708 1 NA 6 NA 1
## 5 2710 1 NA 11 NA 1
## 6 2716 1 900 200 7 1
## 7 2716 3 1003 210 19 17
## 8 2718 1 498 335 10 9
## 9 2718 2 NA 220 NA 8
## 10 2720 1 NA 3 NA 1
## # … with 61,037 more rows, and 1 more variable: load_after_six <lgl>
boardings_before_after_six_months <- boardings %>%
# Bring in rider dates
left_join(rider_enrollment_dates, by = c("id", "times")) %>%
# Get rid of rider/time combinations that don't exist
filter(!is.na(DateIssued)) %>%
# Create indicator for whether each sale is 6+ months after the initial care issue
mutate(after_six = (week - days(6)) > six_months_later,
after_six = ifelse(after_six, "after_six", "before_six")) %>%
# Get total boardings before/after 6+ months
group_by(id, times, after_six) %>%
summarize(across(starts_with("boardings_"), ~sum(.))) %>%
ungroup() %>%
# Make wide
pivot_wider(names_from = after_six, values_from = starts_with("boardings_"))
boardings_before_after_six_months
## # A tibble: 87,963 × 6
## id times boardings_king_county_a… boardings_king_… boardings_sound… boardings_sound…
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 568 1 0 NA 2 NA
## 2 2690 1 310 247 194 68
## 3 2690 2 0 1 3 0
## 4 2704 1 NA 2 NA 2
## 5 2706 1 14 34 0 32
## 6 2708 1 5 2 0 0
## 7 2708 2 NA 0 NA 6
## 8 2710 1 7 2 0 1
## 9 2710 2 0 NA 2 NA
## 10 2716 1 1358 442 564 25
## # … with 87,953 more rows
riders_final <- riders_clean %>%
# Create treatment variables
# Consider NA initial loads to be 0
replace_na(list(`Initial Load` = "0")) %>%
mutate(incentive_cat = factor(`Initial Load`, levels = incentives,
labels = incentives_labs, ordered = TRUE),
treatment_passport_binary = !is.na(`Study Card`),
treatment_sap_binary = incentive_cat == "Subsidized Annual Pass") %>%
group_by(id) %>%
mutate(incentive_cat_collapsed = fct_collapse(
incentive_cat,
">$10" = c("$15", "$20", "$30", "$50", "$70"),
"Shorter Pass" = c("Misc. Pass", "Monthly Pass", "Passport"))) %>%
mutate(across(c(incentive_cat, incentive_cat_collapsed, treatment_passport_binary, treatment_sap_binary),
~lag(.), .names = "{.col}_prev")) %>%
#
# Create outcome variables
# Bring in collapsed sales data for reloading outcome
left_join(sales_before_after_six_months, by = c("id", "times")) %>%
# Create reenrollment outcomes
group_by(id) %>%
mutate(ever_reenroll = any(times > 1),
reenrolled = times > 1,
enrolled_previously = n() != 1,
reenroll_after_study = treatment_passport_binary_prev & ever_reenroll) %>%
ungroup() %>%
#
# Create controls and confounders
# Bring in collapsed boarding data for ride use history
left_join(boardings_before_after_six_months, by = c("id", "times")) %>%
# Bring in census data
left_join(acs_wa, by = c("FIPS" = "GEOID")) %>%
#
# Final data cleaning
# Replace NAs with actual data
mutate(across(c(starts_with("total_amount"), starts_with("total_loadings"),
starts_with("boardings_")),
~replace_na(., 0))) %>%
# Add "Nothing" as a category
mutate(across(starts_with("incentive_cat"), ~fct_expand(.x, "Nothing"))) %>%
replace_na(list(load_after_six = FALSE, treatment_passport_binary_prev = FALSE,
treatment_sap_binary_prev = FALSE,
incentive_cat_prev = "Nothing", incentive_cat_collapsed_prev = "Nothing")) %>%
# Put "Nothing" first so it's the reference category
mutate(across(starts_with("incentive_cat"), ~fct_relevel(.x, "Nothing"))) %>%
# Get rid of "Nothing" if there aren't any
mutate(across(starts_with("incentive_cat"), ~fct_drop(.x, only = "Nothing")))
# MAYBE: Total money spent before 2019/2020 enrollment
# MAYBE: Total refills before 2019/2020 enrollment
# Only look at rows starting on March 13, 2019, since that's when the study
# formally began
riders_final_2019 <- riders_final %>%
filter(DateIssued >= ymd("2019-03-13"))
glimpse(riders_final_2019)
## Rows: 61,277
## Columns: 42
## $ Age <dbl> 58, 49, 46, 33, 63, 47, 57, 59, 56, 49, 51, 3…
## $ RaceDesc <chr> "White", "Black or African American", "Asian"…
## $ LanguageSpoken <chr> "English", "English", "English", "Somali", "E…
## $ Expiration <date> 2021-11-30, 2023-11-30, 2021-05-31, 2021-08-…
## $ DateIssued <date> 2019-08-28, 2022-01-26, 2019-06-30, 2019-06-…
## $ CardIssuingAgency <chr> "King County Public Health", "DSHS - ORCA LIF…
## $ FIPS <chr> "530330040001", "530530609051", "530330061002…
## $ `Initial Load` <chr> "0", "10", "0", "0", "0", "0", "0", "10", "10…
## $ `Study Card` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ id <int> 2690, 2704, 2706, 2716, 2718, 2720, 2724, 272…
## $ times <int> 2, 1, 1, 3, 2, 2, 2, 3, 3, 2, 3, 1, 3, 1, 2, …
## $ card_id_orig <chr> "2690-2", "2704-1", "2706-1", "2716-3", "2718…
## $ incentive_cat <ord> $0, $10, $0, $0, $0, $0, $0, $10, $10, $0, $1…
## $ treatment_passport_binary <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ treatment_sap_binary <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ incentive_cat_collapsed <ord> $0, $10, $0, $0, $0, $0, $0, $10, $10, $0, $1…
## $ incentive_cat_prev <ord> $0, Nothing, Nothing, $0, $0, $0, $10, $0, Mo…
## $ incentive_cat_collapsed_prev <ord> $0, Nothing, Nothing, $0, $0, $0, $10, $0, Sh…
## $ treatment_passport_binary_prev <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ treatment_sap_binary_prev <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ total_amount_after_six <dbl> 0, 0, 0, 1003, 0, 0, 10, 0, 0, 220, 0, 0, 0, …
## $ total_amount_before_six <dbl> 0.0, 0.0, 82.5, 210.0, 220.0, 0.0, 18.0, 0.0,…
## $ total_loadings_after_six <dbl> 0, 0, 0, 19, 0, 0, 1, 0, 0, 11, 0, 0, 0, 0, 4…
## $ total_loadings_before_six <dbl> 0, 0, 3, 17, 8, 0, 2, 0, 0, 4, 3, 2, 0, 0, 0,…
## $ load_after_six <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE…
## $ ever_reenroll <lgl> TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, T…
## $ reenrolled <lgl> TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, T…
## $ enrolled_previously <lgl> TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, T…
## $ reenroll_after_study <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ boardings_king_county_after_six <dbl> 0, 0, 14, 370, 0, 0, 6, 0, 231, 299, 0, 8, 0,…
## $ boardings_king_county_before_six <dbl> 1, 2, 34, 304, 171, 0, 15, 0, 155, 91, 53, 42…
## $ boardings_sound_transit_after_six <dbl> 3, 0, 0, 133, 0, 0, 4, 0, 6, 3, 0, 2, 0, 0, 9…
## $ boardings_sound_transit_before_six <dbl> 0, 2, 32, 165, 4, 0, 0, 0, 4, 5, 1, 6, 0, 0, …
## $ bg_pct_white <dbl> 0.631, 0.698, 0.802, 0.158, 0.562, 0.627, 0.6…
## $ bg_pct_male <dbl> 0.460, 0.467, 0.481, 0.513, 0.528, 0.483, 0.5…
## $ bg_pct_hs <dbl> 0.0526, 0.2432, 0.0330, 0.1362, 0.0738, 0.099…
## $ bg_pct_ba <dbl> 0.316, 0.129, 0.280, 0.289, 0.252, 0.445, 0.2…
## $ bg_internet <dbl> 0.981, 0.689, 0.937, 0.783, 0.945, 1.000, 1.0…
## $ bg_travel_time_per_capita <dbl> 0.504, 0.391, 0.544, 0.381, 0.647, 0.641, 0.4…
## $ bg_pub_trans_per_capita <dbl> 0.11825, 0.00869, 0.12396, 0.15455, 0.21778, …
## $ bg_income <dbl> 71500, 47917, 115176, 43125, 60739, 70047, 10…
## $ bg_median_rent <dbl> 1343, 974, 1484, 1019, 1276, 1410, 1299, 1299…
---
title: "Data cleaning"
editor_options:
chunk_output_type: inline
format:
html:
code-fold: "show"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(fig.align = "center", fig.retina = 2,
fig.width = 6, fig.height = (6 * 0.618),
out.width = "80%", collapse = TRUE,
dev = "png", dev.args = list(type = "cairo-png"))
options(digits = 3, width = 90,
dplyr.summarise.inform = FALSE)
```
```{r load-packages, warning=FALSE, message=FALSE}
library(tidyverse)
library(tidycensus)
library(sf)
library(lubridate)
# library(tsibble)
library(here)
```
# Research questions and variables
## Q2
### Q2-A: Effect of different levels of incentives on longer-term loading of value and passes
General model:
- `long_term_loading ~ different_incentives_categorical + age + race + language + issuing_office + home_address_FIPS?`
Outcomes:
- `load_after_six`: Binary indicator for whether they reloaded the card 6+ months after card is issued
- `total_amount_after_six`: Amount of money refilled 6+ months after card is issued
- `total_loadings_after_six`: Count of refills 6+ months after card is issued
Treatment:
- `incentive_cat`: Categorical variable showing the kind of incentive each person was given with the card, if any. Possible values are 0, 10, 15, 20, 30, 50, 70, Misc. Pass, Monthly Pass, Passport, and Subsidized Annual Pass; the values are ordered based on their intensity
### Q2-B: Effect of different levels of incentives on re-enrollment in ORCA LIFT
General model:
- `reenrollment_in_orca ~ different_incentives_categorical + age + race + language + issuing_office + home_address_FIPS?`
Outcomes:
- `ever_reenroll`: Binary indicator for whether the person ever reenrolled (every row for that person is all TRUE or all FALSE)
- `reenrolled`: Binary indicator for whether current card issuing is a reenrollment (TRUE when the suffix for the card ID is greater than 1)
Treatment:
- Same as Q2-A
### Q2-C: Effects of subsidized passes through RCTs on these two outcome variables
General models:
- `long_term_loading ~ subsidized_passes + other_stuff`
- `reenrollment_in_orca ~ subsidized_passes + other_stuff`
Outcomes:
- Same as Q2-A and Q2-B, and maybe...
- `reenroll_after_study`: Binary indicator for whether the person reenrolled specifically after being assigned to one of the pseudo RCT treatment groups and receiving a passport previously
Treatment:
- Same as Q2-A and Q2-B, and maybe...
- `treatment_passport_binary`: Binary indicator for whether the person was in Phase 1 or Phase 2 of the pseudo RCT passport program
### Controls
Other details related to ridership and purchasing, like:
- `total_amount_before_six`
- `total_loadings_before_six`
- `enrolled_previously`
- `boardings_king_county_before_six`
- `boardings_king_county_after_six`
- `boardings_sound_transit_before_six`
- `boardings_sound_transit_after_six`
Details included in the rider registry:
- `Age`
- `RaceDesc`
- `LanguageSpoken`
- `CardIssuingAgency`
ACS 2019 block-group-level data for each rider's home address:
- `bg_population`: Block-group population
- `bg_race`: Block-group denominator for race
- `bg_race_white`: Count of white residents
- `bg_income`: Median income in block-group
- And many others TODO
# Data cleaning for Q1
TODO
# Data cleaning for Q2
Note: `GEOID` in `LIFT_registry.csv` is 12 digits, so it's for the [Census block group](https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html) (`2 + 3 + 6 + 1`, for `state + county + tract + block group`)
```{r load-q2-data, warning=FALSE, message=FALSE}
lift_boardings1_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county",
"Question 2_ Fare Subsidies", "LIFT_boardings.csv"))
lift_boardings2_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county",
"Question 2_ Fare Subsidies",
"LIFT_boardings_2021-11-01_to_2022-03-06.csv"))
lift_sales_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county",
"Question 2_ Fare Subsidies", "LIFT_sales_2022-04-01.csv"))
lift_registry_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county",
"Question 2_ Fare Subsidies", "LIFT_registry_2022-04-01.csv"))
```
## Rider data
```{r clean-riders-data}
incentives <- c("0", "10", "15", "20", "30", "50", "70", "Misc. Pass",
"Monthly Pass", "Passport", "Subsidized Annual Pass")
incentives_labs <- c("$0", "$10", "$15", "$20", "$30", "$50", "$70", "Misc. Pass",
"Monthly Pass", "Passport", "Subsidized Annual Pass")
riders_clean <- lift_registry_raw %>%
# Split id column to get enrollment count
mutate(card_id_orig = card_id) %>%
separate(card_id, into = c("id", "times"), sep = "-") %>%
# Clean up column types
mutate(across(c(id, times), as.integer)) %>%
mutate(FIPS = as.character(FIPS)) %>%
# This date is a typo
mutate(Expiration = ifelse(Expiration == "00534984", NA, Expiration)) %>%
# Make the expiration date an actual date
mutate(Expiration = mdy(Expiration)) %>%
# Exclude rows where the expiration date is before the issued date
filter(Expiration > DateIssued) %>%
# Sometimes the same person gets enrolled twice in one day (like 2716-3)
# Only keep the last row of duplicate issue dates for individuals
group_by(id, DateIssued) %>%
slice_tail() %>%
# Renumber the times column
# group_by(id) %>%
# mutate(times = 1:n()) %>%
# Get rid of extra columns
ungroup() %>%
select(-duplicate) %>%
# Sort
arrange(id, times)
riders_clean
```
## Sales data
```{r clean-sales-data}
sales <- lift_sales_raw %>%
group_by(week, card_id) %>%
summarize(total_amount = sum(Amount),
total_loadings = sum(loadings)) %>%
ungroup()# %>%
# as_tsibble(key = card_id, index = week)
sales
# This takes a while
# sales_full_panel <- sales %>%
# fill_gaps(.full = TRUE) %>%
# replace_na(list(total_amount = 0, total_loadings = 0)) %>%
# mutate(card_id_orig = card_id) %>%
# separate(card_id, into = c("id", "times"), sep = "-") %>%
# as_tsibble(key = card_id_orig, index = week) %>%
# mutate(across(c(id, times), as.integer)) %>%
# mutate(year_week = yearweek(week, week_start = 1),
# year_week_nice = format(year_week, "%V/%Y"),
# year_month = yearmonth(week),
# year_month_nice = format(year_month, "%m/%Y"))
```
## Boardings data
```{r clean-boardings-data}
boardings <- lift_boardings1_raw %>%
bind_rows(lift_boardings2_raw) %>%
select(week, card_id,
boardings_king_county = `King County Metro`,
boardings_sound_transit = `Sound Transit`) %>%
mutate(card_id_orig = card_id) %>%
separate(card_id, into = c("id", "times"), sep = "-") %>%
mutate(across(c(id, times), as.integer)) %>%
mutate(across(starts_with("boardings"), ~replace_na(., 0)))# %>%
# as_tsibble(key = card_id_orig, index = week)
boardings
```
## Census data
```{r get-clean-census, cache=TRUE, message=FALSE, warning=FALSE, results="hide"}
# See a list of variable names
# Also available at https://api.census.gov/data/2020/acs/acs5/variables.html
# acs_possible_vars <- load_variables(2020, "acs5", cache = TRUE)
acs_vars <- tribble(
~name, ~var_name, ~description,
"B01003_001", "bg_population", "Total population",
"B02001_001", "bg_race_denom", "Race denominator",
"B02001_002", "bg_race_white", "Race: White alone",
"B19013_001", "bg_income", "Median household income",
"B01001_001", "bg_age_denom", "Age denominator",
"B01001_002", "bg_male", "Male",
"B15003_001", "bg_educ_denom", "Education denominator",
"B15003_017", "bg_hs", "High school",
"B15003_022", "bg_ba", "Bachelor's degree",
"B08134_061", "bg_pub_trans", "Minutes on public transportation",
"B08303_001", "bg_travel_time", "Total travel time",
"B25064_001", "bg_median_rent", "Median rent",
"B28002_001", "bg_internet_denom", "Internet denominator",
"B28002_013", "bg_no_internet", "No internet"
)
# Create a named vector to pass to get_acs()
vars_to_get <- acs_vars$name %>%
set_names(acs_vars$var_name)
# Get 2019 ACS data
# 2020 would be neat, but ≈20% of it is missing, ugh
# 2020 decennial would be neat too, but it's a huge mess
acs_raw <- get_acs(geography = "block group",
variables = vars_to_get,
state = 53, year = 2019, survey = "acs5")
# Make the data wide
acs_wa <- acs_raw %>%
select(-NAME, -moe) %>%
pivot_wider(names_from = "variable", values_from = "estimate") %>%
mutate(bg_pct_white = bg_race_white / bg_race_denom,
bg_pct_male = bg_male / bg_population,
bg_travel_time_per_capita = bg_travel_time / bg_population,
bg_pub_trans_per_capita = bg_pub_trans / bg_population,
bg_pct_hs = bg_hs / bg_educ_denom,
bg_pct_ba = bg_ba / bg_educ_denom,
bg_internet = 1 - (bg_no_internet / bg_internet_denom)) %>%
select(GEOID, bg_pct_white, bg_pct_male, bg_pct_hs, bg_pct_ba,
bg_internet, bg_travel_time_per_capita, bg_pub_trans_per_capita,
bg_income, bg_median_rent)
# Shapefiles for WA block groups
# Get block group boundaries
wa_bgs <- tigris::block_groups(state = 53, cb = FALSE, year = 2019)
```
## Merge data
```{r merge-data}
# Make a smaller dataset of just IDs and enrollment dates and then make a column
# of enrollment dates + 180 days - we'll use this to process and collapse the
# sales and boarding data
rider_enrollment_dates <- riders_clean %>%
select(id, times, DateIssued) %>%
mutate(six_months_later = DateIssued + days(180))
rider_enrollment_dates
# Calculate how much money riders put on cards + frequency of refills in the 0-6
# and 6+ months after getting issued a card
sales_before_after_six_months <- sales %>%
# Split id column to get enrollment count
mutate(card_id_orig = card_id) %>%
separate(card_id, into = c("id", "times"), sep = "-") %>%
# Clean up column types
mutate(across(c(id, times), as.integer)) %>%
# Bring in rider dates
left_join(rider_enrollment_dates, by = c("id", "times")) %>%
# Get rid of rider/time combinations that don't exist
filter(!is.na(DateIssued)) %>%
# Create indicator for whether each sale is 6+ months after the initial care issue
mutate(after_six = (week - days(6)) > six_months_later,
after_six = ifelse(after_six, "after_six", "before_six")) %>%
# Get total of money loaded and frequency of loading before/after 6+ months
group_by(id, times, after_six) %>%
summarize(across(c(total_amount, total_loadings), ~sum(.))) %>%
ungroup() %>%
# Make wide
pivot_wider(names_from = after_six, values_from = c(total_amount, total_loadings)) %>%
mutate(load_after_six = total_amount_after_six > 0)
sales_before_after_six_months
boardings_before_after_six_months <- boardings %>%
# Bring in rider dates
left_join(rider_enrollment_dates, by = c("id", "times")) %>%
# Get rid of rider/time combinations that don't exist
filter(!is.na(DateIssued)) %>%
# Create indicator for whether each sale is 6+ months after the initial care issue
mutate(after_six = (week - days(6)) > six_months_later,
after_six = ifelse(after_six, "after_six", "before_six")) %>%
# Get total boardings before/after 6+ months
group_by(id, times, after_six) %>%
summarize(across(starts_with("boardings_"), ~sum(.))) %>%
ungroup() %>%
# Make wide
pivot_wider(names_from = after_six, values_from = starts_with("boardings_"))
boardings_before_after_six_months
riders_final <- riders_clean %>%
# Create treatment variables
# Consider NA initial loads to be 0
replace_na(list(`Initial Load` = "0")) %>%
mutate(incentive_cat = factor(`Initial Load`, levels = incentives,
labels = incentives_labs, ordered = TRUE),
treatment_passport_binary = !is.na(`Study Card`),
treatment_sap_binary = incentive_cat == "Subsidized Annual Pass") %>%
group_by(id) %>%
mutate(incentive_cat_collapsed = fct_collapse(
incentive_cat,
">$10" = c("$15", "$20", "$30", "$50", "$70"),
"Shorter Pass" = c("Misc. Pass", "Monthly Pass", "Passport"))) %>%
mutate(across(c(incentive_cat, incentive_cat_collapsed, treatment_passport_binary, treatment_sap_binary),
~lag(.), .names = "{.col}_prev")) %>%
#
# Create outcome variables
# Bring in collapsed sales data for reloading outcome
left_join(sales_before_after_six_months, by = c("id", "times")) %>%
# Create reenrollment outcomes
group_by(id) %>%
mutate(ever_reenroll = any(times > 1),
reenrolled = times > 1,
enrolled_previously = n() != 1,
reenroll_after_study = treatment_passport_binary_prev & ever_reenroll) %>%
ungroup() %>%
#
# Create controls and confounders
# Bring in collapsed boarding data for ride use history
left_join(boardings_before_after_six_months, by = c("id", "times")) %>%
# Bring in census data
left_join(acs_wa, by = c("FIPS" = "GEOID")) %>%
#
# Final data cleaning
# Replace NAs with actual data
mutate(across(c(starts_with("total_amount"), starts_with("total_loadings"),
starts_with("boardings_")),
~replace_na(., 0))) %>%
# Add "Nothing" as a category
mutate(across(starts_with("incentive_cat"), ~fct_expand(.x, "Nothing"))) %>%
replace_na(list(load_after_six = FALSE, treatment_passport_binary_prev = FALSE,
treatment_sap_binary_prev = FALSE,
incentive_cat_prev = "Nothing", incentive_cat_collapsed_prev = "Nothing")) %>%
# Put "Nothing" first so it's the reference category
mutate(across(starts_with("incentive_cat"), ~fct_relevel(.x, "Nothing"))) %>%
# Get rid of "Nothing" if there aren't any
mutate(across(starts_with("incentive_cat"), ~fct_drop(.x, only = "Nothing")))
# MAYBE: Total money spent before 2019/2020 enrollment
# MAYBE: Total refills before 2019/2020 enrollment
# Only look at rows starting on March 13, 2019, since that's when the study
# formally began
riders_final_2019 <- riders_final %>%
filter(DateIssued >= ymd("2019-03-13"))
glimpse(riders_final_2019)
```
## Save data
```{r save-data}
saveRDS(riders_final, here("data", "derived_data", "riders_final.rds"))
saveRDS(riders_final_2019, here("data", "derived_data", "riders_final_2019.rds"))
saveRDS(acs_wa, here("data", "derived_data", "washington_acs.rds"))
saveRDS(wa_bgs, here("data", "derived_data", "washington_block-groups.rds"))
```