2 Process, clean, and reshape data

library(tidyverse)

Popular conjoint survey platforms like Qualtrics and Sawtooth typically provide results data in two separate data files: (1) individual participant-level responses and (2) a bank of possible alternative-level combinations of features. To analyze the results of a conjoint experiment, the two datasets need to be joined.

2.1 Individual-level responses

responses <- readRDS(here::here("data", "processed_data", "responses_illustration.rds"))
responses

# A tibble: 295 × 15
   resp_id gender   age CBC_Random1 CBC_Random2 CBC_Random3 CBC_Random4
     <dbl> <chr>  <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
 1       4 Female    19           1           2           1           1
 2       5 Female    19           2           1           2           2
 3       6 Male      20           2           1           1           2
 4       7 Female    20           2           2           1           2
 5       8 Female    20           2           2           2           2
 6       9 Male      20           1           2           1           1
 7      10 Female    19           2           2           2           1
 8      11 Male      22           2           1           1           1
 9      12 Male      20           1           1           2           1
10      13 Female    19           2           2           2           2
# ℹ 285 more rows
# ℹ 8 more variables: CBC_Random5 <dbl>, CBC_Random6 <dbl>, CBC_Random7 <dbl>,
#   CBC_Random8 <dbl>, CBC_Random9 <dbl>, CBC_Random10 <dbl>,
#   CBC_Random11 <dbl>, CBC_Random12 <dbl>

2.2 Possible alternatives

alternatives <- readRDS(here::here("data", "processed_data", "alternatives_illustration.rds"))
alternatives

# A tibble: 7,200 × 6
   version question   alt price packaging         flavor   
     <dbl>    <dbl> <dbl> <fct> <fct>             <fct>    
 1       1        1     1 $2    Plastic + paper   Chocolate
 2       1        1     2 $4    Plastic + sticker Nuts     
 3       1        2     1 $3    Plastic + sticker Nuts     
 4       1        2     2 $4    Plastic + paper   Chocolate
 5       1        3     1 $2    Plastic + paper   Chocolate
 6       1        3     2 $3    Plastic + sticker Nuts     
 7       1        4     1 $2    Plastic + paper   Chocolate
 8       1        4     2 $3    Plastic + paper   Nuts     
 9       1        5     1 $4    Plastic + sticker Nuts     
10       1        5     2 $4    Plastic + sticker Chocolate
# ℹ 7,190 more rows

alternatives |> 
  summarize(
    versions = n_distinct(version),
    questions = n_distinct(question),
    alts = n_distinct(alt)
  )

# A tibble: 1 × 3
  versions questions  alts
     <int>     <int> <int>
1      300        12     2

2.3 Pivoting and expanding

The original responses data is wide, with a column for each of the 12 choices. We first need to make it long, with a row for each respondent-choice

responses_long <- responses |> 
  pivot_longer(
    cols = starts_with("CBC_Random"),
    names_to = "question_raw",
    values_to = "chosen_alt"
  ) %>%
  # The task number is embedded in text, like "CBC_Random6"; this extracts it
  mutate(question = as.numeric(str_extract(question_raw, "\\d+"))) %>%
  select(-question_raw)
responses_long

# A tibble: 3,540 × 5
   resp_id gender   age chosen_alt question
     <dbl> <chr>  <dbl>      <dbl>    <dbl>
 1       4 Female    19          1        1
 2       4 Female    19          2        2
 3       4 Female    19          1        3
 4       4 Female    19          1        4
 5       4 Female    19          1        5
 6       4 Female    19          1        6
 7       4 Female    19          2        7
 8       4 Female    19          1        8
 9       4 Female    19          1        9
10       4 Female    19          1       10
# ℹ 3,530 more rows

responses_long |> 
  summarize(
    resp_ids = n_distinct(resp_id),
    questions = n_distinct(question)
  )

# A tibble: 1 × 2
  resp_ids questions
     <int>     <int>
1      295        12

responses_long_expanded <- responses_long |>
  expand(resp_id, question, alt = 1:2) |> 
  left_join(responses_long, by = join_by(resp_id, question))
responses_long_expanded

# A tibble: 7,080 × 6
   resp_id question   alt gender   age chosen_alt
     <dbl>    <dbl> <int> <chr>  <dbl>      <dbl>
 1       4        1     1 Female    19          1
 2       4        1     2 Female    19          1
 3       4        2     1 Female    19          2
 4       4        2     2 Female    19          2
 5       4        3     1 Female    19          1
 6       4        3     2 Female    19          1
 7       4        4     1 Female    19          1
 8       4        4     2 Female    19          1
 9       4        5     1 Female    19          1
10       4        5     2 Female    19          1
# ℹ 7,070 more rows

responses_long_expanded |> 
  summarize(
    resp_ids = n_distinct(resp_id),
    questions = n_distinct(question),
    alts = n_distinct(alt)
  )

# A tibble: 1 × 3
  resp_ids questions  alts
     <int>     <int> <int>
1      295        12     2

2.4 Final data

combined <- responses_long_expanded |> 
  left_join(alternatives, by = join_by(resp_id == version, question, alt)) |> 
  mutate(choice = as.numeric(alt == chosen_alt))

nrow(combined) / 2 / 12

[1] 295