12.2 Acquiring a Year’s Worth of Statcast Data
Let’s say that we want to retrieve the full 2023 season data from Statcast.
# getting 2023 season statcast data
# data_dir <- "./data"
# statcast_dir <- path(data_dir, "sc_2023")
# if (!dir.exists(statcast_dir)) {
# dir.create(statcast_dir)
# }
#
# statcast_season(year = 2023, dir = statcast_dir)
#
# sc2023 <- statcast_dir |>
# statcast_read_csv(pattern = "sc_2023.+\\.csv")
Repeat the same process for the 2021 and 2022 seasons, changing the year accordingly.
Now, let’s verify the validity of the 2023 season data.
# Download the 2023 Statcast .rds file to a temporary location, load it into
# `sc2023`, and report its dimensions.
tempfile_loc <- tempfile(fileext = ".rds")
url <- "https://statcast-data.atl1.digitaloceanspaces.com/statcast_2023.rds"

# R's default download timeout is 60 seconds, which is often too short for a
# large file. Raise it for this transfer only and restore the previous value
# afterwards, even if the download fails.
old_opts <- options(timeout = max(600, getOption("timeout")))
tryCatch(
  # mode = "wb" requests a binary transfer explicitly, so the serialized
  # file is written byte-for-byte regardless of platform defaults.
  download.file(url, tempfile_loc, mode = "wb"),
  finally = options(old_opts)
)

sc2023 <- read_rds(tempfile_loc)
dim(sc2023)
## [1] 717945 118
## Rows: 6
## Columns: 118
## $ pitch_type <chr> "FF", "FF", "CU", "FF", "SI",…
## $ game_date <date> 2023-03-30, 2023-03-30, 2023…
## $ release_speed <dbl> 96.3, 96.7, 77.5, 97.1, 94.1,…
## $ release_pos_x <dbl> -2.06, -2.02, -1.80, -2.10, -…
## $ release_pos_z <dbl> 6.03, 5.94, 6.24, 5.91, 5.99,…
## $ player_name <chr> "Vosler, Jason", "Vosler, Jas…
## $ batter <dbl> 613564, 613564, 613564, 66388…
## $ pitcher <dbl> 656605, 656605, 656605, 65660…
## $ events <chr> "triple", NA, NA, "strikeout"…
## $ description <chr> "hit_into_play", "foul", "fou…
## $ spin_dir <lgl> NA, NA, NA, NA, NA, NA
## $ spin_rate_deprecated <lgl> NA, NA, NA, NA, NA, NA
## $ break_angle_deprecated <lgl> NA, NA, NA, NA, NA, NA
## $ break_length_deprecated <lgl> NA, NA, NA, NA, NA, NA
## $ zone <dbl> 12, 14, 5, 5, 11, 11
## $ des <chr> "Jason Vosler triples (1) on …
## $ game_type <chr> "R", "R", "R", "R", "R", "R"
## $ stand <chr> "L", "L", "L", "R", "R", "L"
## $ p_throws <chr> "R", "R", "R", "R", "R", "R"
## $ home_team <chr> "CIN", "CIN", "CIN", "CIN", "…
## $ away_team <chr> "PIT", "PIT", "PIT", "PIT", "…
## $ type <chr> "X", "S", "S", "S", "S", "X"
## $ hit_location <dbl> 9, NA, NA, 2, NA, 8
## $ bb_type <chr> "line_drive", NA, NA, NA, NA,…
## $ balls <dbl> 0, 0, 0, 1, 1, 1
## $ strikes <dbl> 2, 1, 0, 2, 1, 2
## $ game_year <dbl> 2023, 2023, 2023, 2023, 2023,…
## $ pfx_x <dbl> -0.78, -0.75, 0.92, -0.77, -1…
## $ pfx_z <dbl> 1.36, 1.36, -1.22, 1.22, 0.69…
## $ plate_x <dbl> 0.91, 1.13, 0.14, 0.12, -1.14…
## $ plate_z <dbl> 2.55, 2.27, 2.50, 2.10, 3.30,…
## $ on_3b <dbl> 663697, 663697, 663697, 66369…
## $ on_2b <dbl> NA, NA, NA, NA, NA, NA
## $ on_1b <dbl> 641584, 641584, 641584, 64158…
## $ outs_when_up <dbl> 2, 2, 2, 1, 1, 2
## $ inning <dbl> 5, 5, 5, 5, 5, 6
## $ inning_topbot <chr> "Bot", "Bot", "Bot", "Bot", "…
## $ hc_x <dbl> 215, NA, NA, NA, NA, 165
## $ hc_y <dbl> 107, NA, NA, NA, NA, 105
## $ tfs_deprecated <lgl> NA, NA, NA, NA, NA, NA
## $ tfs_zulu_deprecated <lgl> NA, NA, NA, NA, NA, NA
## $ umpire <lgl> NA, NA, NA, NA, NA, NA
## $ sv_id <lgl> NA, NA, NA, NA, NA, NA
## $ vx0 <dbl> 9.61, 10.08, 2.46, 7.71, 5.75…
## $ vy0 <dbl> -140, -140, -113, -141, -137,…
## $ vz0 <dbl> -6.82, -7.26, 1.29, -7.44, -2…
## $ ax <dbl> -12.17, -12.27, 7.25, -12.11,…
## $ ay <dbl> 27.2, 34.6, 23.7, 32.9, 26.1,…
## $ az <dbl> -12.9, -12.7, -42.9, -14.4, -…
## $ sz_top <dbl> 3.34, 3.34, 3.34, 3.15, 3.30,…
## $ sz_bot <dbl> 1.61, 1.61, 1.61, 1.56, 1.51,…
## $ hit_distance_sc <dbl> 134, 162, NA, NA, 171, 9
## $ launch_speed <dbl> 94.2, 70.4, NA, NA, 66.5, 93.7
## $ launch_angle <dbl> 9, 18, NA, NA, 22, -19
## $ effective_speed <dbl> 96.7, 96.1, 77.1, 97.0, 94.8,…
## $ release_spin_rate <dbl> 2308, 2472, 2659, 2435, 2287,…
## $ release_extension <dbl> 6.3, 6.4, 6.4, 6.4, 6.3, 6.0
## $ game_pk <dbl> 718773, 718773, 718773, 71877…
## $ fielder_2 <dbl> 595978, 595978, 595978, 59597…
## $ fielder_3 <dbl> 467793, 467793, 467793, 46779…
## $ fielder_4 <dbl> 678225, 678225, 678225, 67822…
## $ fielder_5 <dbl> 663647, 663647, 663647, 66364…
## $ fielder_6 <dbl> 665833, 665833, 665833, 66583…
## $ fielder_7 <dbl> 668804, 668804, 668804, 66880…
## $ fielder_8 <dbl> 669261, 669261, 669261, 66926…
## $ fielder_9 <dbl> 675986, 675986, 675986, 67598…
## $ release_pos_y <dbl> 54.2, 54.1, 54.1, 54.1, 54.2,…
## $ estimated_ba_using_speedangle <dbl> 0.660, NA, NA, NA, NA, 0.146
## $ estimated_woba_using_speedangle <dbl> 0.622, NA, NA, 0.000, NA, 0.1…
## $ woba_value <dbl> 1.6, NA, NA, 0.0, NA, 0.9
## $ woba_denom <dbl> 1, NA, NA, 1, NA, 1
## $ babip_value <dbl> 1, NA, NA, 0, NA, 1
## $ iso_value <dbl> 2, NA, NA, 0, NA, 0
## $ launch_speed_angle <dbl> 4, NA, NA, NA, NA, 2
## $ at_bat_number <dbl> 49, 49, 49, 48, 48, 39
## $ pitch_number <dbl> 3, 2, 1, 4, 3, 4
## $ pitch_name <chr> "4-Seam Fastball", "4-Seam Fa…
## $ home_score <dbl> 2, 2, 2, 2, 2, 0
## $ away_score <dbl> 4, 4, 4, 4, 4, 2
## $ bat_score <dbl> 2, 2, 2, 2, 2, 2
## $ fld_score <dbl> 4, 4, 4, 4, 4, 0
## $ post_away_score <dbl> 4, 4, 4, 4, 4, 3
## $ post_home_score <dbl> 4, 2, 2, 2, 2, 0
## $ post_bat_score <dbl> 4, 2, 2, 2, 2, 3
## $ post_fld_score <dbl> 4, 4, 4, 4, 4, 0
## $ if_fielding_alignment <chr> "Standard", "Standard", "Stan…
## $ of_fielding_alignment <chr> "Standard", "Standard", "Stan…
## $ spin_axis <dbl> 222, 220, 35, 224, 223, 235
## $ delta_home_win_exp <dbl> 0.276, 0.000, 0.000, -0.088, …
## $ delta_run_exp <dbl> 1.189, -0.054, -0.052, -0.248…
## $ bat_speed <dbl> NA, NA, NA, NA, NA, NA
## $ swing_length <dbl> NA, NA, NA, NA, NA, NA
## $ estimated_slg_using_speedangle <dbl> 0.754, NA, NA, NA, NA, 0.178
## $ delta_pitcher_run_exp <dbl> -1.189, 0.054, 0.052, 0.248, …
## $ hyper_speed <dbl> 94.2, 88.0, NA, NA, 88.0, 93.7
## $ home_score_diff <dbl> -2, -2, -2, -2, -2, -2
## $ bat_score_diff <dbl> -2, -2, -2, -2, -2, 2
## $ home_win_exp <dbl> 0.277, 0.277, 0.277, 0.365, 0…
## $ bat_win_exp <dbl> 0.277, 0.277, 0.277, 0.365, 0…
## $ age_pit_legacy <dbl> 27, 27, 27, 27, 27, 27
## $ age_bat_legacy <dbl> 29, 29, 29, 26, 26, 31
## $ age_pit <dbl> 27, 27, 27, 27, 27, 28
## $ age_bat <dbl> 30, 30, 30, 27, 27, 31
## $ n_thruorder_pitcher <dbl> 3, 3, 3, 3, 3, 3
## $ n_priorpa_thisgame_player_at_bat <dbl> 2, 2, 2, 2, 2, 2
## $ pitcher_days_since_prev_game <dbl> NA, NA, NA, NA, NA, NA
## $ batter_days_since_prev_game <dbl> NA, NA, NA, NA, NA, NA
## $ pitcher_days_until_next_game <dbl> 6, 6, 6, 6, 6, 5
## $ batter_days_until_next_game <dbl> 3, 3, 3, 2, 2, 1
## $ api_break_z_with_gravity <dbl> 1.07, 1.11, 5.07, 1.21, 1.85,…
## $ api_break_x_arm <dbl> 0.78, 0.75, -0.92, 0.77, 1.50…
## $ api_break_x_batter_in <dbl> -0.78, -0.75, 0.92, 0.77, 1.5…
## $ arm_angle <dbl> 38.8, 39.9, 49.1, 34.6, 39.4,…
## $ attack_angle <dbl> NA, NA, NA, NA, NA, NA
## $ attack_direction <dbl> NA, NA, NA, NA, NA, NA
## $ swing_path_tilt <dbl> NA, NA, NA, NA, NA, NA
## $ intercept_ball_minus_batter_pos_x_inches <dbl> NA, NA, NA, NA, NA, NA
## $ intercept_ball_minus_batter_pos_y_inches <dbl> NA, NA, NA, NA, NA, NA
# Season sanity check: for each game_type, count the distinct game_pk values,
# the number of rows, and the rows whose `events` value is "home_run".
# `%in%` never returns NA, so rows with a missing `events` value simply do
# not count toward the home-run total.
summarise(
  group_by(sc2023, game_type),
  num_games = n_distinct(game_pk),
  num_pitches = n(),
  num_hr = sum(events %in% "home_run")
)
## # A tibble: 1 × 4
## game_type num_games num_pitches num_hr
## <chr> <int> <int> <int>
## 1 R 2430 717945 5868