12.2 Acquiring a Year’s Worth of Statcast Data

Let’s say that we want to retrieve the full 2023 season data from Statcast.

# Getting 2023 season Statcast data.
# NOTE(review): this chunk is commented out, so it does not run; the data
# used below is instead read from a pre-built .rds file.
# NOTE(review): `path()`, `statcast_season()` and `statcast_read_csv()` are
# not defined in this section -- presumably they come from packages loaded
# earlier; confirm before uncommenting.
# NOTE(review): dir.create() is non-recursive by default, so "./data" must
# already exist for the directory creation below to succeed.
# data_dir <- "./data"
# statcast_dir <- path(data_dir, "sc_2023")
# if (!dir.exists(statcast_dir)) {
#      dir.create(statcast_dir)
# }
# 
# statcast_season(year = 2023, dir = statcast_dir)
# 
# sc2023 <- statcast_dir |> 
#      statcast_read_csv(pattern = "sc_2023.+\\.csv")

Repeat the same process for the 2021 and 2022 seasons, changing the year accordingly.

Now, let’s verify the validity of the 2023 season data.

# Download the pre-built 2023 Statcast data (an .rds file) to a temporary
# file and read it into `sc2023`.
tempfile_loc <- tempfile(fileext = ".rds")
url <- "https://statcast-data.atl1.digitaloceanspaces.com/statcast_2023.rds"

# download.file() is governed by the `timeout` option (60 seconds by default),
# which is often too short for large files such as a full season of pitch
# data; raise it for this download only and restore the old value afterwards.
old_opts <- options(timeout = max(600, getOption("timeout")))
# mode = "wb" requests a binary transfer explicitly, so the serialized file is
# not altered by text-mode line-ending translation on Windows.
download.file(url, tempfile_loc, mode = "wb")
options(old_opts)

sc2023 <- read_rds(tempfile_loc)

# Dimensions of the season data: number of rows, then number of columns.
dim(sc2023)
## [1] 717945    118
# Preview column names and types using only the first six rows.
glimpse(head(sc2023))
## Rows: 6
## Columns: 118
## $ pitch_type                               <chr> "FF", "FF", "CU", "FF", "SI",…
## $ game_date                                <date> 2023-03-30, 2023-03-30, 2023…
## $ release_speed                            <dbl> 96.3, 96.7, 77.5, 97.1, 94.1,…
## $ release_pos_x                            <dbl> -2.06, -2.02, -1.80, -2.10, -…
## $ release_pos_z                            <dbl> 6.03, 5.94, 6.24, 5.91, 5.99,…
## $ player_name                              <chr> "Vosler, Jason", "Vosler, Jas…
## $ batter                                   <dbl> 613564, 613564, 613564, 66388…
## $ pitcher                                  <dbl> 656605, 656605, 656605, 65660…
## $ events                                   <chr> "triple", NA, NA, "strikeout"…
## $ description                              <chr> "hit_into_play", "foul", "fou…
## $ spin_dir                                 <lgl> NA, NA, NA, NA, NA, NA
## $ spin_rate_deprecated                     <lgl> NA, NA, NA, NA, NA, NA
## $ break_angle_deprecated                   <lgl> NA, NA, NA, NA, NA, NA
## $ break_length_deprecated                  <lgl> NA, NA, NA, NA, NA, NA
## $ zone                                     <dbl> 12, 14, 5, 5, 11, 11
## $ des                                      <chr> "Jason Vosler triples (1) on …
## $ game_type                                <chr> "R", "R", "R", "R", "R", "R"
## $ stand                                    <chr> "L", "L", "L", "R", "R", "L"
## $ p_throws                                 <chr> "R", "R", "R", "R", "R", "R"
## $ home_team                                <chr> "CIN", "CIN", "CIN", "CIN", "…
## $ away_team                                <chr> "PIT", "PIT", "PIT", "PIT", "…
## $ type                                     <chr> "X", "S", "S", "S", "S", "X"
## $ hit_location                             <dbl> 9, NA, NA, 2, NA, 8
## $ bb_type                                  <chr> "line_drive", NA, NA, NA, NA,…
## $ balls                                    <dbl> 0, 0, 0, 1, 1, 1
## $ strikes                                  <dbl> 2, 1, 0, 2, 1, 2
## $ game_year                                <dbl> 2023, 2023, 2023, 2023, 2023,…
## $ pfx_x                                    <dbl> -0.78, -0.75, 0.92, -0.77, -1…
## $ pfx_z                                    <dbl> 1.36, 1.36, -1.22, 1.22, 0.69…
## $ plate_x                                  <dbl> 0.91, 1.13, 0.14, 0.12, -1.14…
## $ plate_z                                  <dbl> 2.55, 2.27, 2.50, 2.10, 3.30,…
## $ on_3b                                    <dbl> 663697, 663697, 663697, 66369…
## $ on_2b                                    <dbl> NA, NA, NA, NA, NA, NA
## $ on_1b                                    <dbl> 641584, 641584, 641584, 64158…
## $ outs_when_up                             <dbl> 2, 2, 2, 1, 1, 2
## $ inning                                   <dbl> 5, 5, 5, 5, 5, 6
## $ inning_topbot                            <chr> "Bot", "Bot", "Bot", "Bot", "…
## $ hc_x                                     <dbl> 215, NA, NA, NA, NA, 165
## $ hc_y                                     <dbl> 107, NA, NA, NA, NA, 105
## $ tfs_deprecated                           <lgl> NA, NA, NA, NA, NA, NA
## $ tfs_zulu_deprecated                      <lgl> NA, NA, NA, NA, NA, NA
## $ umpire                                   <lgl> NA, NA, NA, NA, NA, NA
## $ sv_id                                    <lgl> NA, NA, NA, NA, NA, NA
## $ vx0                                      <dbl> 9.61, 10.08, 2.46, 7.71, 5.75…
## $ vy0                                      <dbl> -140, -140, -113, -141, -137,…
## $ vz0                                      <dbl> -6.82, -7.26, 1.29, -7.44, -2…
## $ ax                                       <dbl> -12.17, -12.27, 7.25, -12.11,…
## $ ay                                       <dbl> 27.2, 34.6, 23.7, 32.9, 26.1,…
## $ az                                       <dbl> -12.9, -12.7, -42.9, -14.4, -…
## $ sz_top                                   <dbl> 3.34, 3.34, 3.34, 3.15, 3.30,…
## $ sz_bot                                   <dbl> 1.61, 1.61, 1.61, 1.56, 1.51,…
## $ hit_distance_sc                          <dbl> 134, 162, NA, NA, 171, 9
## $ launch_speed                             <dbl> 94.2, 70.4, NA, NA, 66.5, 93.7
## $ launch_angle                             <dbl> 9, 18, NA, NA, 22, -19
## $ effective_speed                          <dbl> 96.7, 96.1, 77.1, 97.0, 94.8,…
## $ release_spin_rate                        <dbl> 2308, 2472, 2659, 2435, 2287,…
## $ release_extension                        <dbl> 6.3, 6.4, 6.4, 6.4, 6.3, 6.0
## $ game_pk                                  <dbl> 718773, 718773, 718773, 71877…
## $ fielder_2                                <dbl> 595978, 595978, 595978, 59597…
## $ fielder_3                                <dbl> 467793, 467793, 467793, 46779…
## $ fielder_4                                <dbl> 678225, 678225, 678225, 67822…
## $ fielder_5                                <dbl> 663647, 663647, 663647, 66364…
## $ fielder_6                                <dbl> 665833, 665833, 665833, 66583…
## $ fielder_7                                <dbl> 668804, 668804, 668804, 66880…
## $ fielder_8                                <dbl> 669261, 669261, 669261, 66926…
## $ fielder_9                                <dbl> 675986, 675986, 675986, 67598…
## $ release_pos_y                            <dbl> 54.2, 54.1, 54.1, 54.1, 54.2,…
## $ estimated_ba_using_speedangle            <dbl> 0.660, NA, NA, NA, NA, 0.146
## $ estimated_woba_using_speedangle          <dbl> 0.622, NA, NA, 0.000, NA, 0.1…
## $ woba_value                               <dbl> 1.6, NA, NA, 0.0, NA, 0.9
## $ woba_denom                               <dbl> 1, NA, NA, 1, NA, 1
## $ babip_value                              <dbl> 1, NA, NA, 0, NA, 1
## $ iso_value                                <dbl> 2, NA, NA, 0, NA, 0
## $ launch_speed_angle                       <dbl> 4, NA, NA, NA, NA, 2
## $ at_bat_number                            <dbl> 49, 49, 49, 48, 48, 39
## $ pitch_number                             <dbl> 3, 2, 1, 4, 3, 4
## $ pitch_name                               <chr> "4-Seam Fastball", "4-Seam Fa…
## $ home_score                               <dbl> 2, 2, 2, 2, 2, 0
## $ away_score                               <dbl> 4, 4, 4, 4, 4, 2
## $ bat_score                                <dbl> 2, 2, 2, 2, 2, 2
## $ fld_score                                <dbl> 4, 4, 4, 4, 4, 0
## $ post_away_score                          <dbl> 4, 4, 4, 4, 4, 3
## $ post_home_score                          <dbl> 4, 2, 2, 2, 2, 0
## $ post_bat_score                           <dbl> 4, 2, 2, 2, 2, 3
## $ post_fld_score                           <dbl> 4, 4, 4, 4, 4, 0
## $ if_fielding_alignment                    <chr> "Standard", "Standard", "Stan…
## $ of_fielding_alignment                    <chr> "Standard", "Standard", "Stan…
## $ spin_axis                                <dbl> 222, 220, 35, 224, 223, 235
## $ delta_home_win_exp                       <dbl> 0.276, 0.000, 0.000, -0.088, …
## $ delta_run_exp                            <dbl> 1.189, -0.054, -0.052, -0.248…
## $ bat_speed                                <dbl> NA, NA, NA, NA, NA, NA
## $ swing_length                             <dbl> NA, NA, NA, NA, NA, NA
## $ estimated_slg_using_speedangle           <dbl> 0.754, NA, NA, NA, NA, 0.178
## $ delta_pitcher_run_exp                    <dbl> -1.189, 0.054, 0.052, 0.248, …
## $ hyper_speed                              <dbl> 94.2, 88.0, NA, NA, 88.0, 93.7
## $ home_score_diff                          <dbl> -2, -2, -2, -2, -2, -2
## $ bat_score_diff                           <dbl> -2, -2, -2, -2, -2, 2
## $ home_win_exp                             <dbl> 0.277, 0.277, 0.277, 0.365, 0…
## $ bat_win_exp                              <dbl> 0.277, 0.277, 0.277, 0.365, 0…
## $ age_pit_legacy                           <dbl> 27, 27, 27, 27, 27, 27
## $ age_bat_legacy                           <dbl> 29, 29, 29, 26, 26, 31
## $ age_pit                                  <dbl> 27, 27, 27, 27, 27, 28
## $ age_bat                                  <dbl> 30, 30, 30, 27, 27, 31
## $ n_thruorder_pitcher                      <dbl> 3, 3, 3, 3, 3, 3
## $ n_priorpa_thisgame_player_at_bat         <dbl> 2, 2, 2, 2, 2, 2
## $ pitcher_days_since_prev_game             <dbl> NA, NA, NA, NA, NA, NA
## $ batter_days_since_prev_game              <dbl> NA, NA, NA, NA, NA, NA
## $ pitcher_days_until_next_game             <dbl> 6, 6, 6, 6, 6, 5
## $ batter_days_until_next_game              <dbl> 3, 3, 3, 2, 2, 1
## $ api_break_z_with_gravity                 <dbl> 1.07, 1.11, 5.07, 1.21, 1.85,…
## $ api_break_x_arm                          <dbl> 0.78, 0.75, -0.92, 0.77, 1.50…
## $ api_break_x_batter_in                    <dbl> -0.78, -0.75, 0.92, 0.77, 1.5…
## $ arm_angle                                <dbl> 38.8, 39.9, 49.1, 34.6, 39.4,…
## $ attack_angle                             <dbl> NA, NA, NA, NA, NA, NA
## $ attack_direction                         <dbl> NA, NA, NA, NA, NA, NA
## $ swing_path_tilt                          <dbl> NA, NA, NA, NA, NA, NA
## $ intercept_ball_minus_batter_pos_x_inches <dbl> NA, NA, NA, NA, NA, NA
## $ intercept_ball_minus_batter_pos_y_inches <dbl> NA, NA, NA, NA, NA, NA
# Sanity-check the season totals: for each game type, count the distinct
# games, the rows (named as pitches here), and the home runs.
sc2023 |>
  group_by(game_type) |>
  summarize(
    num_games = n_distinct(game_pk),
    num_pitches = n(),
    # `%in%` is FALSE for a missing `events`, so NA rows are not counted
    num_hr = sum(events %in% "home_run")
  )
## # A tibble: 1 × 4
##   game_type num_games num_pitches num_hr
##   <chr>         <int>       <int>  <int>
## 1 R              2430      717945   5868