9.1 Setup

9.1.1 Retrieve situation states

# Add the situation columns with the helper from chapter 5.
retro2016 <- retro2016 |>
  abdwr3edata::retrosheet_add_states()

# List the added columns by name instead of by position (99:107): positional
# indexing would silently display the wrong columns if the number of upstream
# columns ever changed.
retro2016 |>
  dplyr::select(bases:runs_scored) |>
  colnames()
## [1] "bases"       "state"       "is_runner1"  "is_runner2"  "is_runner3" 
## [6] "new_outs"    "new_bases"   "new_state"   "runs_scored"

9.1.2 Sum runs and ID half innings

# One row per half inning; the id is built from game_id, inn_ct and
# bat_home_id.
half_innings <- retro2016 |>
  group_by(half_inning_id = paste(game_id, inn_ct, bat_home_id)) |>
  summarize(
    # total of event_outs_ct over the plays of the half inning
    outs_inning = sum(event_outs_ct),
    # combined away + home score on the first play of the half inning
    runs_start = first(away_score_ct + home_score_ct),
    # total of runs_scored over the plays of the half inning
    runs_inning = sum(runs_scored),
    # combined score after the half inning's runs are added
    max_runs = runs_start + runs_inning
  )

# How many half innings were found.
nrow(half_innings)
## [1] 43420

9.1.3 Meaningful plays

  • retain only plays where the state changes or at least one run scores
  • keep only complete half innings (i.e. those with three outs)
  • ignore non-batting events: steals, caught stealing, wild pitches, passed balls
# Keep the plays of interest and attach the per-half-inning totals.
retro2016_complete <- retro2016 |>
  # play-level conditions: bat_event_fl is set, and the play either changed
  # the state or scored at least one run
  filter(bat_event_fl, state != new_state | runs_scored > 0) |>
  mutate(half_inning_id = paste(game_id, inn_ct, bat_home_id)) |>
  inner_join(half_innings, join_by(half_inning_id)) |>
  # half-inning-level condition: exactly three outs were recorded
  filter(outs_inning == 3)

# Share of the original plays that survive the filters, as a whole percent.
print(
  paste0(
    round(100 * (nrow(retro2016_complete) / nrow(retro2016))),
    " percent"
  )
)
## [1] "96 percent"

9.1.4 End of innings

In our definition of the new_state variable, we recorded the runner locations when there were three outs. The runner locations don’t matter, so we recode new_state to always have the value 3 when the number of outs is equal to 3.

# Collapse every "<three runner digits> 3" value of new_state to the single
# label "3".
retro2016_complete <- retro2016_complete |>
  mutate(
    new_state = str_replace(
      new_state,
      pattern = "[0-1]{3} 3",
      replacement = "3"
    )
  )

# Frequency of each resulting value of new_state.
retro2016_complete |>
  pull(new_state) |>
  table()
## 
## 000 0 000 1 000 2 001 0 001 1 001 2 010 0 010 1 010 2 011 0 011 1 011 2 100 0 
##  2088 32533 25974   349  1518  2447  2698  4496  5661   526  1265  1453 10914 
## 100 1 100 2 101 0 101 1 101 2 110 0 110 1 110 2 111 0 111 1 111 2     3 
## 13097 13206   940  2047  2749  2539  4571  5643   688  1666  1946 42803