Code
library(tidyverse)
library(tidycensus)
library(sf)
library(lubridate)
# library(tsibble)
library(here)General model:
long_term_loading ~ different_incentives_categorical + age + race + language + issuing_office + home_address_FIPS?Outcomes:
load_after_six: Binary indicator for whether they reloaded the card 6+ months after card is issuedtotal_amount_after_six: Amount of money refilled 6+ months after card is issuedtotal_loadings_after_six: Count of refills 6+ months after card is issuedTreatment:
incentive_cat: Categorical variable showing the kind of incentive each person was given with the card, if any. Possible values are 0, 10, 15, 20, 30, 50, 70, Misc. Pass, Monthly Pass, Passport, and Subsidized Annual Pass; the values are ordered based on their intensityGeneral model:
reenrollment_in_orca ~ different_incentives_categorical + age + race + language + issuing_office + home_address_FIPS?Outcomes:
ever_reenroll: Binary indicator for whether the person ever reenrolled (every row for that person is all TRUE or all FALSE)reenrolled: Binary indicator for whether current card issuing is a reenrollment (TRUE when the suffix for the card ID is greater than 1)Treatment:
General models:
long_term_loading ~ subsidized_passes + other_stuffreenrollment_in_orca ~ subsidized_passes + other_stuffOutcomes:
reenroll_after_study: Binary indicator for whether the person reenrolled specifically after being assigned to one of the pseudo RCT treatment groups and receiving a passport previouslyTreatment:
treatment_passport_binary: Binary indicator for whether the person was in Phase 1 or Phase 2 of the pseudo RCT passport programOther details related to ridership and purchasing, like:
total_amount_before_sixtotal_loadings_before_sixenrolled_previouslyboardings_king_county_before_sixboardings_king_county_after_sixboardings_sound_transit_before_sixboardings_sound_transit_after_sixDetails included in the rider registry:
AgeRaceDescLanguageSpokenCardIssuingAgencyACS 2019 block-group-level data for each rider’s home address:
bg_population: Block-group populationbg_race: Block-group denominator for racebg_race_white: Count of white residentsbg_income: Median income in block-groupTODO
Note: GEOID in LIFT_registry.csv is 12 digits, so it’s for the Census block group (2 + 3 + 6 + 1, for state + county + tract + block group)
lift_boardings1_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county", 
                                     "Question 2_ Fare Subsidies", "LIFT_boardings.csv"))
lift_boardings2_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county", 
                                     "Question 2_ Fare Subsidies", 
                                     "LIFT_boardings_2021-11-01_to_2022-03-06.csv"))
lift_sales_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county", 
                                "Question 2_ Fare Subsidies", "LIFT_sales_2022-04-01.csv"))
lift_registry_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county", 
                                   "Question 2_ Fare Subsidies", "LIFT_registry_2022-04-01.csv"))incentives <- c("0", "10", "15", "20", "30", "50", "70", "Misc. Pass", 
                "Monthly Pass", "Passport", "Subsidized Annual Pass")
incentives_labs <- c("$0", "$10", "$15", "$20", "$30", "$50", "$70", "Misc. Pass", 
                     "Monthly Pass", "Passport", "Subsidized Annual Pass")
riders_clean <- lift_registry_raw %>% 
  # Split id column to get enrollment count
  mutate(card_id_orig = card_id) %>% 
  separate(card_id, into = c("id", "times"), sep = "-") %>% 
  # Clean up column types
  mutate(across(c(id, times), as.integer)) %>% 
  mutate(FIPS = as.character(FIPS)) %>% 
  # This date is a typo
  mutate(Expiration = ifelse(Expiration == "00534984", NA, Expiration)) %>% 
  # Make the expiration date an actual date
  mutate(Expiration = mdy(Expiration)) %>%
  # Exclude rows where the expiration date is before the issued date
  filter(Expiration > DateIssued) %>% 
  # Sometimes the same person gets enrolled twice in one day (like 2716-3)
  # Only keep the last row of duplicate issue dates for individuals
  group_by(id, DateIssued) %>% 
  slice_tail() %>% 
  # Renumber the times column
  # group_by(id) %>% 
  # mutate(times = 1:n()) %>% 
  # Get rid of extra columns
  ungroup() %>% 
  select(-duplicate) %>% 
  # Sort
  arrange(id, times)
riders_clean
## # A tibble: 116,791 × 12
##      Age RaceDesc              LanguageSpoken Expiration DateIssued CardIssuingAgen… FIPS 
##    <dbl> <chr>                 <chr>          <date>     <date>     <chr>            <chr>
##  1    36 Black or African Ame… Other          2019-08-31 2017-06-15 KCMCCS           5303…
##  2    56 White                 English        2019-07-31 2017-06-05 YWCA             5303…
##  3    58 White                 English        2021-11-30 2019-08-28 King County Pub… 5303…
##  4    49 Black or African Ame… English        2023-11-30 2022-01-26 DSHS - ORCA LIF… 5305…
##  5    46 Asian                 English        2021-05-31 2019-06-30 King County Pub… 5303…
##  6    63 Asian                 Chinese        2019-03-31 2017-03-31 Promo Account    5303…
##  7    64 Asian                 Chinese        2019-06-30 2018-06-02 Promo Account    5303…
##  8    59 Asian                 Chinese        2019-03-31 2017-03-31 Promo Account    5303…
##  9    60 Asian                 Chinese        2019-06-30 2018-06-02 Promo Account    5303…
## 10    31 Black or African Ame… Somali         2019-05-31 2017-04-13 KCMCCS           5303…
## # … with 116,781 more rows, and 5 more variables: `Initial Load` <chr>,
## #   `Study Card` <chr>, id <int>, times <int>, card_id_orig <chr>sales <- lift_sales_raw %>% 
  group_by(week, card_id) %>% 
  summarize(total_amount = sum(Amount),
            total_loadings = sum(loadings)) %>% 
  ungroup()# %>% 
  # as_tsibble(key = card_id, index = week)
sales
## # A tibble: 554,944 × 4
##    week       card_id total_amount total_loadings
##    <date>     <chr>          <dbl>          <dbl>
##  1 2017-03-05 11870-1           15              2
##  2 2017-03-05 12162-1           54              1
##  3 2017-03-05 13722-1           54              1
##  4 2017-03-05 19358-1           54              1
##  5 2017-03-05 24318-1           54              1
##  6 2017-03-05 33232-1           10              1
##  7 2017-03-05 64512-1           54              1
##  8 2017-03-05 84558-1           20              1
##  9 2017-03-05 93734-1           20              1
## 10 2017-03-05 93740-1           54              1
## # … with 554,934 more rows
# This takes a while
# sales_full_panel <- sales %>% 
#   fill_gaps(.full = TRUE) %>% 
#   replace_na(list(total_amount = 0, total_loadings = 0)) %>% 
#   mutate(card_id_orig = card_id) %>% 
#   separate(card_id, into = c("id", "times"), sep = "-") %>% 
#   as_tsibble(key = card_id_orig, index = week) %>% 
#   mutate(across(c(id, times), as.integer)) %>% 
#   mutate(year_week = yearweek(week, week_start = 1),
#          year_week_nice = format(year_week, "%V/%Y"),
#          year_month = yearmonth(week),
#          year_month_nice = format(year_month, "%m/%Y"))boardings <- lift_boardings1_raw %>%
  bind_rows(lift_boardings2_raw) %>% 
  select(week, card_id, 
         boardings_king_county = `King County Metro`,
         boardings_sound_transit = `Sound Transit`) %>% 
  mutate(card_id_orig = card_id) %>% 
  separate(card_id, into = c("id", "times"), sep = "-") %>% 
  mutate(across(c(id, times), as.integer)) %>% 
  mutate(across(starts_with("boardings"), ~replace_na(., 0)))# %>% 
  # as_tsibble(key = card_id_orig, index = week)
boardings
## # A tibble: 1,783,168 × 6
##    week          id times boardings_king_county boardings_sound_transit card_id_orig
##    <date>     <int> <int>                 <dbl>                   <dbl> <chr>       
##  1 2017-03-12 94262     1                     8                       0 94262-1     
##  2 2017-03-19 94262     1                     4                       0 94262-1     
##  3 2017-03-26 94262     1                     2                       0 94262-1     
##  4 2017-04-02 94262     1                     4                       0 94262-1     
##  5 2017-04-16 94262     1                     2                       0 94262-1     
##  6 2017-07-09 94262     1                     2                       0 94262-1     
##  7 2017-07-16 94262     1                     2                       0 94262-1     
##  8 2017-07-30 94262     1                     3                       0 94262-1     
##  9 2017-08-06 94262     1                     2                       0 94262-1     
## 10 2017-08-20 94262     1                     3                       0 94262-1     
## # … with 1,783,158 more rows# See a list of variable names
# Also available at https://api.census.gov/data/2020/acs/acs5/variables.html
# acs_possible_vars <- load_variables(2020, "acs5", cache = TRUE)
acs_vars <- tribble(
  ~name,        ~var_name,       ~description,
  "B01003_001", "bg_population",  "Total population",
  "B02001_001", "bg_race_denom",  "Race denominator",
  "B02001_002", "bg_race_white",  "Race: White alone",
  "B19013_001", "bg_income",      "Median household income",
  "B01001_001", "bg_age_denom",   "Age denominator",
  "B01001_002", "bg_male",        "Male",
  "B15003_001", "bg_educ_denom",  "Education denominator",
  "B15003_017", "bg_hs",          "High school",
  "B15003_022", "bg_ba",          "Bachelor's degree",
  "B08134_061", "bg_pub_trans",   "Minutes on public transportation",
  "B08303_001", "bg_travel_time", "Total travel time",
  "B25064_001", "bg_median_rent", "Median rent",
  "B28002_001", "bg_internet_denom", "Internet denominator",
  "B28002_013", "bg_no_internet", "No internet"
)
# Create a named vector to pass to get_acs()
vars_to_get <- acs_vars$name %>% 
  set_names(acs_vars$var_name)
# Get 2019 ACS data
# 2020 would be neat, but ≈20% of it is missing, ugh
# 2020 decennial would be neat too, but it's a huge mess
acs_raw <- get_acs(geography = "block group", 
                   variables = vars_to_get,
                   state = 53, year = 2019, survey = "acs5")
# Make the data wide
acs_wa <- acs_raw %>% 
  select(-NAME, -moe) %>% 
  pivot_wider(names_from = "variable", values_from = "estimate") %>% 
  mutate(bg_pct_white = bg_race_white / bg_race_denom,
         bg_pct_male = bg_male / bg_population,
         bg_travel_time_per_capita = bg_travel_time / bg_population,
         bg_pub_trans_per_capita = bg_pub_trans / bg_population,
         bg_pct_hs = bg_hs / bg_educ_denom,
         bg_pct_ba = bg_ba / bg_educ_denom,
         bg_internet = 1 - (bg_no_internet / bg_internet_denom)) %>% 
  select(GEOID, bg_pct_white, bg_pct_male, bg_pct_hs, bg_pct_ba, 
         bg_internet, bg_travel_time_per_capita, bg_pub_trans_per_capita, 
         bg_income, bg_median_rent)
# Shapefiles for WA block groups
# Get block group boundaries
wa_bgs <- tigris::block_groups(state = 53, cb = FALSE, year = 2019)# Make a smaller dataset of just IDs and enrollment dates and then make a column
# of enrollment dates + 180 days - we'll use this to process and collapse the
# sales and boarding data
rider_enrollment_dates <- riders_clean %>% 
  select(id, times, DateIssued) %>% 
  mutate(six_months_later = DateIssued + days(180))
rider_enrollment_dates
## # A tibble: 116,791 × 4
##       id times DateIssued six_months_later
##    <int> <int> <date>     <date>          
##  1   568     1 2017-06-15 2017-12-12      
##  2  2690     1 2017-06-05 2017-12-02      
##  3  2690     2 2019-08-28 2020-02-24      
##  4  2704     1 2022-01-26 2022-07-25      
##  5  2706     1 2019-06-30 2019-12-27      
##  6  2708     1 2017-03-31 2017-09-27      
##  7  2708     2 2018-06-02 2018-11-29      
##  8  2710     1 2017-03-31 2017-09-27      
##  9  2710     2 2018-06-02 2018-11-29      
## 10  2716     1 2017-04-13 2017-10-10      
## # … with 116,781 more rows
# Calculate how much money riders put on cards + frequency of refills in the 0-6
# and 6+ months after getting issued a card
sales_before_after_six_months <- sales %>%
  # Split id column to get enrollment count
  mutate(card_id_orig = card_id) %>% 
  separate(card_id, into = c("id", "times"), sep = "-") %>% 
  # Clean up column types
  mutate(across(c(id, times), as.integer)) %>% 
  # Bring in rider dates
  left_join(rider_enrollment_dates, by = c("id", "times")) %>% 
  # Get rid of rider/time combinations that don't exist
  filter(!is.na(DateIssued)) %>% 
  # Create indicator for whether each sale is 6+ months after the initial care issue
  mutate(after_six = (week - days(6)) > six_months_later,
         after_six = ifelse(after_six, "after_six", "before_six")) %>% 
  # Get total of money loaded and frequency of loading before/after 6+ months
  group_by(id, times, after_six) %>% 
  summarize(across(c(total_amount, total_loadings), ~sum(.))) %>% 
  ungroup() %>% 
  # Make wide
  pivot_wider(names_from = after_six, values_from = c(total_amount, total_loadings)) %>% 
  mutate(load_after_six = total_amount_after_six > 0)
sales_before_after_six_months
## # A tibble: 61,047 × 7
##       id times total_amount_after_six total_amount_befo… total_loadings_… total_loadings_…
##    <int> <int>                  <dbl>              <dbl>            <dbl>            <dbl>
##  1   568     1                     30               NA                  2               NA
##  2  2690     1                    310              293.                15               10
##  3  2706     1                     NA               82.5               NA                3
##  4  2708     1                     NA                6                 NA                1
##  5  2710     1                     NA               11                 NA                1
##  6  2716     1                    900              200                  7                1
##  7  2716     3                   1003              210                 19               17
##  8  2718     1                    498              335                 10                9
##  9  2718     2                     NA              220                 NA                8
## 10  2720     1                     NA                3                 NA                1
## # … with 61,037 more rows, and 1 more variable: load_after_six <lgl>
boardings_before_after_six_months <- boardings %>%
  # Bring in rider dates
  left_join(rider_enrollment_dates, by = c("id", "times")) %>% 
  # Get rid of rider/time combinations that don't exist
  filter(!is.na(DateIssued)) %>% 
  # Create indicator for whether each sale is 6+ months after the initial care issue
  mutate(after_six = (week - days(6)) > six_months_later,
         after_six = ifelse(after_six, "after_six", "before_six")) %>% 
  # Get total boardings before/after 6+ months
  group_by(id, times, after_six) %>% 
  summarize(across(starts_with("boardings_"), ~sum(.))) %>% 
  ungroup() %>% 
  # Make wide
  pivot_wider(names_from = after_six, values_from = starts_with("boardings_")) 
boardings_before_after_six_months
## # A tibble: 87,963 × 6
##       id times boardings_king_county_a… boardings_king_… boardings_sound… boardings_sound…
##    <int> <int>                    <dbl>            <dbl>            <dbl>            <dbl>
##  1   568     1                        0               NA                2               NA
##  2  2690     1                      310              247              194               68
##  3  2690     2                        0                1                3                0
##  4  2704     1                       NA                2               NA                2
##  5  2706     1                       14               34                0               32
##  6  2708     1                        5                2                0                0
##  7  2708     2                       NA                0               NA                6
##  8  2710     1                        7                2                0                1
##  9  2710     2                        0               NA                2               NA
## 10  2716     1                     1358              442              564               25
## # … with 87,953 more rows
riders_final <- riders_clean %>% 
  # Create treatment variables
  # Consider NA initial loads to be 0
  replace_na(list(`Initial Load` = "0")) %>% 
  mutate(incentive_cat = factor(`Initial Load`, levels = incentives, 
                                labels = incentives_labs, ordered = TRUE),
         treatment_passport_binary = !is.na(`Study Card`),
         treatment_sap_binary = incentive_cat == "Subsidized Annual Pass") %>%
  group_by(id) %>% 
  mutate(incentive_cat_collapsed = fct_collapse(
    incentive_cat,
    ">$10" = c("$15", "$20", "$30", "$50", "$70"),
    "Shorter Pass" = c("Misc. Pass", "Monthly Pass", "Passport"))) %>% 
  mutate(across(c(incentive_cat, incentive_cat_collapsed, treatment_passport_binary, treatment_sap_binary), 
                ~lag(.), .names = "{.col}_prev")) %>% 
  #
  # Create outcome variables
  # Bring in collapsed sales data for reloading outcome
  left_join(sales_before_after_six_months, by = c("id", "times")) %>% 
  # Create reenrollment outcomes
  group_by(id) %>% 
  mutate(ever_reenroll = any(times > 1),
         reenrolled = times > 1,
         enrolled_previously = n() != 1,
         reenroll_after_study = treatment_passport_binary_prev & ever_reenroll) %>% 
  ungroup() %>% 
  #
  # Create controls and confounders
  # Bring in collapsed boarding data for ride use history
  left_join(boardings_before_after_six_months, by = c("id", "times")) %>% 
  # Bring in census data
  left_join(acs_wa, by = c("FIPS" = "GEOID")) %>% 
  #
  # Final data cleaning
  # Replace NAs with actual data
  mutate(across(c(starts_with("total_amount"), starts_with("total_loadings"),
                  starts_with("boardings_")), 
                ~replace_na(., 0))) %>% 
  # Add "Nothing" as a category
  mutate(across(starts_with("incentive_cat"), ~fct_expand(.x, "Nothing"))) %>%
  replace_na(list(load_after_six = FALSE, treatment_passport_binary_prev = FALSE,
                  treatment_sap_binary_prev = FALSE,
                  incentive_cat_prev = "Nothing", incentive_cat_collapsed_prev = "Nothing")) %>% 
  # Put "Nothing" first so it's the reference category
  mutate(across(starts_with("incentive_cat"), ~fct_relevel(.x, "Nothing"))) %>% 
  # Get rid of "Nothing" if there aren't any
  mutate(across(starts_with("incentive_cat"), ~fct_drop(.x, only = "Nothing")))
# MAYBE: Total money spent before 2019/2020 enrollment
# MAYBE: Total refills before 2019/2020 enrollment
# Only look at rows starting on March 13, 2019, since that's when the study
# formally began
riders_final_2019 <- riders_final %>% 
  filter(DateIssued >= ymd("2019-03-13"))
glimpse(riders_final_2019)
## Rows: 61,277
## Columns: 42
## $ Age                                <dbl> 58, 49, 46, 33, 63, 47, 57, 59, 56, 49, 51, 3…
## $ RaceDesc                           <chr> "White", "Black or African American", "Asian"…
## $ LanguageSpoken                     <chr> "English", "English", "English", "Somali", "E…
## $ Expiration                         <date> 2021-11-30, 2023-11-30, 2021-05-31, 2021-08-…
## $ DateIssued                         <date> 2019-08-28, 2022-01-26, 2019-06-30, 2019-06-…
## $ CardIssuingAgency                  <chr> "King County Public Health", "DSHS - ORCA LIF…
## $ FIPS                               <chr> "530330040001", "530530609051", "530330061002…
## $ `Initial Load`                     <chr> "0", "10", "0", "0", "0", "0", "0", "10", "10…
## $ `Study Card`                       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ id                                 <int> 2690, 2704, 2706, 2716, 2718, 2720, 2724, 272…
## $ times                              <int> 2, 1, 1, 3, 2, 2, 2, 3, 3, 2, 3, 1, 3, 1, 2, …
## $ card_id_orig                       <chr> "2690-2", "2704-1", "2706-1", "2716-3", "2718…
## $ incentive_cat                      <ord> $0, $10, $0, $0, $0, $0, $0, $10, $10, $0, $1…
## $ treatment_passport_binary          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ treatment_sap_binary               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ incentive_cat_collapsed            <ord> $0, $10, $0, $0, $0, $0, $0, $10, $10, $0, $1…
## $ incentive_cat_prev                 <ord> $0, Nothing, Nothing, $0, $0, $0, $10, $0, Mo…
## $ incentive_cat_collapsed_prev       <ord> $0, Nothing, Nothing, $0, $0, $0, $10, $0, Sh…
## $ treatment_passport_binary_prev     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ treatment_sap_binary_prev          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ total_amount_after_six             <dbl> 0, 0, 0, 1003, 0, 0, 10, 0, 0, 220, 0, 0, 0, …
## $ total_amount_before_six            <dbl> 0.0, 0.0, 82.5, 210.0, 220.0, 0.0, 18.0, 0.0,…
## $ total_loadings_after_six           <dbl> 0, 0, 0, 19, 0, 0, 1, 0, 0, 11, 0, 0, 0, 0, 4…
## $ total_loadings_before_six          <dbl> 0, 0, 3, 17, 8, 0, 2, 0, 0, 4, 3, 2, 0, 0, 0,…
## $ load_after_six                     <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE…
## $ ever_reenroll                      <lgl> TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, T…
## $ reenrolled                         <lgl> TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, T…
## $ enrolled_previously                <lgl> TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, T…
## $ reenroll_after_study               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ boardings_king_county_after_six    <dbl> 0, 0, 14, 370, 0, 0, 6, 0, 231, 299, 0, 8, 0,…
## $ boardings_king_county_before_six   <dbl> 1, 2, 34, 304, 171, 0, 15, 0, 155, 91, 53, 42…
## $ boardings_sound_transit_after_six  <dbl> 3, 0, 0, 133, 0, 0, 4, 0, 6, 3, 0, 2, 0, 0, 9…
## $ boardings_sound_transit_before_six <dbl> 0, 2, 32, 165, 4, 0, 0, 0, 4, 5, 1, 6, 0, 0, …
## $ bg_pct_white                       <dbl> 0.631, 0.698, 0.802, 0.158, 0.562, 0.627, 0.6…
## $ bg_pct_male                        <dbl> 0.460, 0.467, 0.481, 0.513, 0.528, 0.483, 0.5…
## $ bg_pct_hs                          <dbl> 0.0526, 0.2432, 0.0330, 0.1362, 0.0738, 0.099…
## $ bg_pct_ba                          <dbl> 0.316, 0.129, 0.280, 0.289, 0.252, 0.445, 0.2…
## $ bg_internet                        <dbl> 0.981, 0.689, 0.937, 0.783, 0.945, 1.000, 1.0…
## $ bg_travel_time_per_capita          <dbl> 0.504, 0.391, 0.544, 0.381, 0.647, 0.641, 0.4…
## $ bg_pub_trans_per_capita            <dbl> 0.11825, 0.00869, 0.12396, 0.15455, 0.21778, …
## $ bg_income                          <dbl> 71500, 47917, 115176, 43125, 60739, 70047, 10…
## $ bg_median_rent                     <dbl> 1343, 974, 1484, 1019, 1276, 1410, 1299, 1299…---
title: "Data cleaning"
editor_options: 
  chunk_output_type: inline
format:
  html:
    code-fold: "show"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(fig.align = "center", fig.retina = 2,
                      fig.width = 6, fig.height = (6 * 0.618),
                      out.width = "80%", collapse = TRUE,
                      dev = "png", dev.args = list(type = "cairo-png"))
options(digits = 3, width = 90,
        dplyr.summarise.inform = FALSE)
```
```{r load-packages, warning=FALSE, message=FALSE}
library(tidyverse)
library(tidycensus)
library(sf)
library(lubridate)
# library(tsibble)
library(here)
```
# Research questions and variables
## Q2
### Q2-A: Effect of different levels of incentives on longer-term loading of value and passes 
General model:
- `long_term_loading ~ different_incentives_categorical + age + race + language + issuing_office + home_address_FIPS?`
Outcomes: 
- `load_after_six`: Binary indicator for whether they reloaded the card 6+ months after card is issued
- `total_amount_after_six`: Amount of money refilled 6+ months after card is issued
- `total_loadings_after_six`: Count of refills 6+ months after card is issued
Treatment: 
- `incentive_cat`: Categorical variable showing the kind of incentive each person was given with the card, if any. Possible values are 0, 10, 15, 20, 30, 50, 70, Misc. Pass, Monthly Pass, Passport, and Subsidized Annual Pass; the values are ordered based on their intensity
### Q2-B: Effect of different levels of incentives on re-enrollment in ORCA LIFT
General model:
- `reenrollment_in_orca ~ different_incentives_categorical + age + race + language + issuing_office + home_address_FIPS?`
Outcomes: 
- `ever_reenroll`: Binary indicator for whether the person ever reenrolled (every row for that person is all TRUE or all FALSE)
- `reenrolled`: Binary indicator for whether current card issuing is a reenrollment (TRUE when the suffix for the card ID is greater than 1)
Treatment: 
- Same as Q2-A
### Q2-C: Effects of subsidized passes through RCTs on these two outcome variables
General models: 
- `long_term_loading ~ subsidized_passes + other_stuff`
- `reenrollment_in_orca ~ subsidized_passes + other_stuff`
Outcomes: 
- Same as Q2-A and Q2-B, and maybe...
- `reenroll_after_study`: Binary indicator for whether the person reenrolled specifically after being assigned to one of the pseudo RCT treatment groups and receiving a passport previously
Treatment:
- Same as Q2-A and Q2-B, and maybe...
- `treatment_passport_binary`: Binary indicator for whether the person was in Phase 1 or Phase 2 of the pseudo RCT passport program
### Controls
Other details related to ridership and purchasing, like:
- `total_amount_before_six`
- `total_loadings_before_six`
- `enrolled_previously`
- `boardings_king_county_before_six`
- `boardings_king_county_after_six`
- `boardings_sound_transit_before_six`
- `boardings_sound_transit_after_six`
Details included in the rider registry:
- `Age`
- `RaceDesc`
- `LanguageSpoken`
- `CardIssuingAgency`
ACS 2019 block-group-level data for each rider's home address:
- `bg_population`: Block-group population
- `bg_race`: Block-group denominator for race
- `bg_race_white`: Count of white residents
- `bg_income`: Median income in block-group
- And many others TODO
# Data cleaning for Q1
TODO
# Data cleaning for Q2
Note: `GEOID` in `LIFT_registry.csv` is 12 digits, so it's for the [Census block group](https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html) (`2 + 3 + 6 + 1`, for `state + county + tract + block group`)
```{r load-q2-data, warning=FALSE, message=FALSE}
lift_boardings1_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county", 
                                     "Question 2_ Fare Subsidies", "LIFT_boardings.csv"))
lift_boardings2_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county", 
                                     "Question 2_ Fare Subsidies", 
                                     "LIFT_boardings_2021-11-01_to_2022-03-06.csv"))
lift_sales_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county", 
                                "Question 2_ Fare Subsidies", "LIFT_sales_2022-04-01.csv"))
lift_registry_raw <- read_csv(here("data", "raw_data", "raw_data_from_king_county", 
                                   "Question 2_ Fare Subsidies", "LIFT_registry_2022-04-01.csv"))
```
## Rider data
```{r clean-riders-data}
incentives <- c("0", "10", "15", "20", "30", "50", "70", "Misc. Pass", 
                "Monthly Pass", "Passport", "Subsidized Annual Pass")
incentives_labs <- c("$0", "$10", "$15", "$20", "$30", "$50", "$70", "Misc. Pass", 
                     "Monthly Pass", "Passport", "Subsidized Annual Pass")
riders_clean <- lift_registry_raw %>% 
  # Split id column to get enrollment count
  mutate(card_id_orig = card_id) %>% 
  separate(card_id, into = c("id", "times"), sep = "-") %>% 
  # Clean up column types
  mutate(across(c(id, times), as.integer)) %>% 
  mutate(FIPS = as.character(FIPS)) %>% 
  # This date is a typo
  mutate(Expiration = ifelse(Expiration == "00534984", NA, Expiration)) %>% 
  # Make the expiration date an actual date
  mutate(Expiration = mdy(Expiration)) %>%
  # Exclude rows where the expiration date is before the issued date
  filter(Expiration > DateIssued) %>% 
  # Sometimes the same person gets enrolled twice in one day (like 2716-3)
  # Only keep the last row of duplicate issue dates for individuals
  group_by(id, DateIssued) %>% 
  slice_tail() %>% 
  # Renumber the times column
  # group_by(id) %>% 
  # mutate(times = 1:n()) %>% 
  # Get rid of extra columns
  ungroup() %>% 
  select(-duplicate) %>% 
  # Sort
  arrange(id, times)
riders_clean
```
## Sales data
```{r clean-sales-data}
sales <- lift_sales_raw %>% 
  group_by(week, card_id) %>% 
  summarize(total_amount = sum(Amount),
            total_loadings = sum(loadings)) %>% 
  ungroup()# %>% 
  # as_tsibble(key = card_id, index = week)
sales
# This takes a while
# sales_full_panel <- sales %>% 
#   fill_gaps(.full = TRUE) %>% 
#   replace_na(list(total_amount = 0, total_loadings = 0)) %>% 
#   mutate(card_id_orig = card_id) %>% 
#   separate(card_id, into = c("id", "times"), sep = "-") %>% 
#   as_tsibble(key = card_id_orig, index = week) %>% 
#   mutate(across(c(id, times), as.integer)) %>% 
#   mutate(year_week = yearweek(week, week_start = 1),
#          year_week_nice = format(year_week, "%V/%Y"),
#          year_month = yearmonth(week),
#          year_month_nice = format(year_month, "%m/%Y"))
```
## Boardings data
```{r clean-boardings-data}
boardings <- lift_boardings1_raw %>%
  bind_rows(lift_boardings2_raw) %>% 
  select(week, card_id, 
         boardings_king_county = `King County Metro`,
         boardings_sound_transit = `Sound Transit`) %>% 
  mutate(card_id_orig = card_id) %>% 
  separate(card_id, into = c("id", "times"), sep = "-") %>% 
  mutate(across(c(id, times), as.integer)) %>% 
  mutate(across(starts_with("boardings"), ~replace_na(., 0)))# %>% 
  # as_tsibble(key = card_id_orig, index = week)
boardings
```
## Census data
```{r get-clean-census, cache=TRUE, message=FALSE, warning=FALSE, results="hide"}
# See a list of variable names
# Also available at https://api.census.gov/data/2020/acs/acs5/variables.html
# acs_possible_vars <- load_variables(2020, "acs5", cache = TRUE)
acs_vars <- tribble(
  ~name,        ~var_name,       ~description,
  "B01003_001", "bg_population",  "Total population",
  "B02001_001", "bg_race_denom",  "Race denominator",
  "B02001_002", "bg_race_white",  "Race: White alone",
  "B19013_001", "bg_income",      "Median household income",
  "B01001_001", "bg_age_denom",   "Age denominator",
  "B01001_002", "bg_male",        "Male",
  "B15003_001", "bg_educ_denom",  "Education denominator",
  "B15003_017", "bg_hs",          "High school",
  "B15003_022", "bg_ba",          "Bachelor's degree",
  "B08134_061", "bg_pub_trans",   "Minutes on public transportation",
  "B08303_001", "bg_travel_time", "Total travel time",
  "B25064_001", "bg_median_rent", "Median rent",
  "B28002_001", "bg_internet_denom", "Internet denominator",
  "B28002_013", "bg_no_internet", "No internet"
)
# Create a named vector to pass to get_acs()
vars_to_get <- acs_vars$name %>% 
  set_names(acs_vars$var_name)
# Get 2019 ACS data
# 2020 would be neat, but ≈20% of it is missing, ugh
# 2020 decennial would be neat too, but it's a huge mess
acs_raw <- get_acs(geography = "block group", 
                   variables = vars_to_get,
                   state = 53, year = 2019, survey = "acs5")
# Make the data wide
acs_wa <- acs_raw %>% 
  select(-NAME, -moe) %>% 
  pivot_wider(names_from = "variable", values_from = "estimate") %>% 
  mutate(bg_pct_white = bg_race_white / bg_race_denom,
         bg_pct_male = bg_male / bg_population,
         bg_travel_time_per_capita = bg_travel_time / bg_population,
         bg_pub_trans_per_capita = bg_pub_trans / bg_population,
         bg_pct_hs = bg_hs / bg_educ_denom,
         bg_pct_ba = bg_ba / bg_educ_denom,
         bg_internet = 1 - (bg_no_internet / bg_internet_denom)) %>% 
  select(GEOID, bg_pct_white, bg_pct_male, bg_pct_hs, bg_pct_ba, 
         bg_internet, bg_travel_time_per_capita, bg_pub_trans_per_capita, 
         bg_income, bg_median_rent)
# Shapefiles for WA block groups
# Get block group boundaries
wa_bgs <- tigris::block_groups(state = 53, cb = FALSE, year = 2019)
```
## Merge data
```{r merge-data}
# Make a smaller dataset of just IDs and enrollment dates and then make a column
# of enrollment dates + 180 days - we'll use this to process and collapse the
# sales and boarding data
rider_enrollment_dates <- riders_clean %>% 
  select(id, times, DateIssued) %>% 
  mutate(six_months_later = DateIssued + days(180))
rider_enrollment_dates
# Calculate how much money riders put on cards + frequency of refills in the 0-6
# and 6+ months after getting issued a card
sales_before_after_six_months <- sales %>%
  # Split id column to get enrollment count
  mutate(card_id_orig = card_id) %>% 
  separate(card_id, into = c("id", "times"), sep = "-") %>% 
  # Clean up column types
  mutate(across(c(id, times), as.integer)) %>% 
  # Bring in rider dates
  left_join(rider_enrollment_dates, by = c("id", "times")) %>% 
  # Get rid of rider/time combinations that don't exist
  filter(!is.na(DateIssued)) %>% 
  # Create indicator for whether each sale is 6+ months after the initial care issue
  mutate(after_six = (week - days(6)) > six_months_later,
         after_six = ifelse(after_six, "after_six", "before_six")) %>% 
  # Get total of money loaded and frequency of loading before/after 6+ months
  group_by(id, times, after_six) %>% 
  summarize(across(c(total_amount, total_loadings), ~sum(.))) %>% 
  ungroup() %>% 
  # Make wide
  pivot_wider(names_from = after_six, values_from = c(total_amount, total_loadings)) %>% 
  mutate(load_after_six = total_amount_after_six > 0)
sales_before_after_six_months
boardings_before_after_six_months <- boardings %>%
  # Bring in rider dates
  left_join(rider_enrollment_dates, by = c("id", "times")) %>% 
  # Get rid of rider/time combinations that don't exist
  filter(!is.na(DateIssued)) %>% 
  # Create indicator for whether each sale is 6+ months after the initial care issue
  mutate(after_six = (week - days(6)) > six_months_later,
         after_six = ifelse(after_six, "after_six", "before_six")) %>% 
  # Get total boardings before/after 6+ months
  group_by(id, times, after_six) %>% 
  summarize(across(starts_with("boardings_"), ~sum(.))) %>% 
  ungroup() %>% 
  # Make wide
  pivot_wider(names_from = after_six, values_from = starts_with("boardings_")) 
boardings_before_after_six_months
riders_final <- riders_clean %>% 
  # Create treatment variables
  # Consider NA initial loads to be 0
  replace_na(list(`Initial Load` = "0")) %>% 
  mutate(incentive_cat = factor(`Initial Load`, levels = incentives, 
                                labels = incentives_labs, ordered = TRUE),
         treatment_passport_binary = !is.na(`Study Card`),
         treatment_sap_binary = incentive_cat == "Subsidized Annual Pass") %>%
  group_by(id) %>% 
  mutate(incentive_cat_collapsed = fct_collapse(
    incentive_cat,
    ">$10" = c("$15", "$20", "$30", "$50", "$70"),
    "Shorter Pass" = c("Misc. Pass", "Monthly Pass", "Passport"))) %>% 
  mutate(across(c(incentive_cat, incentive_cat_collapsed, treatment_passport_binary, treatment_sap_binary), 
                ~lag(.), .names = "{.col}_prev")) %>% 
  #
  # Create outcome variables
  # Bring in collapsed sales data for reloading outcome
  left_join(sales_before_after_six_months, by = c("id", "times")) %>% 
  # Create reenrollment outcomes
  group_by(id) %>% 
  mutate(ever_reenroll = any(times > 1),
         reenrolled = times > 1,
         enrolled_previously = n() != 1,
         reenroll_after_study = treatment_passport_binary_prev & ever_reenroll) %>% 
  ungroup() %>% 
  #
  # Create controls and confounders
  # Bring in collapsed boarding data for ride use history
  left_join(boardings_before_after_six_months, by = c("id", "times")) %>% 
  # Bring in census data
  left_join(acs_wa, by = c("FIPS" = "GEOID")) %>% 
  #
  # Final data cleaning
  # Replace NAs with actual data
  mutate(across(c(starts_with("total_amount"), starts_with("total_loadings"),
                  starts_with("boardings_")), 
                ~replace_na(., 0))) %>% 
  # Add "Nothing" as a category
  mutate(across(starts_with("incentive_cat"), ~fct_expand(.x, "Nothing"))) %>%
  replace_na(list(load_after_six = FALSE, treatment_passport_binary_prev = FALSE,
                  treatment_sap_binary_prev = FALSE,
                  incentive_cat_prev = "Nothing", incentive_cat_collapsed_prev = "Nothing")) %>% 
  # Put "Nothing" first so it's the reference category
  mutate(across(starts_with("incentive_cat"), ~fct_relevel(.x, "Nothing"))) %>% 
  # Get rid of "Nothing" if there aren't any
  mutate(across(starts_with("incentive_cat"), ~fct_drop(.x, only = "Nothing")))
# MAYBE: Total money spent before 2019/2020 enrollment
# MAYBE: Total refills before 2019/2020 enrollment
# Only look at rows starting on March 13, 2019, since that's when the study
# formally began
riders_final_2019 <- riders_final %>% 
  filter(DateIssued >= ymd("2019-03-13"))
glimpse(riders_final_2019)
```
## Save data
```{r save-data}
saveRDS(riders_final, here("data", "derived_data", "riders_final.rds"))
saveRDS(riders_final_2019, here("data", "derived_data", "riders_final_2019.rds"))
saveRDS(acs_wa, here("data", "derived_data", "washington_acs.rds"))
saveRDS(wa_bgs, here("data", "derived_data", "washington_block-groups.rds"))
```