Read in data
# Read the parquet file (path resolved relative to the project root via
# here::here) and add two derived columns:
#   Season - calendar year extracted from game_date
#   HR     - 1 when events is "home_run", 0 otherwise (NA if events is NA)
sc_two_seasons <- read_parquet(here::here("data/sc_bip_2021_2023.parquet")) |>
  mutate(
    Season = year(game_date),
    HR = as.numeric(events == "home_run")
  )

# Keep only the rows whose derived Season is 2023.
sc_2023 <- filter(sc_two_seasons, Season == 2023)
Rows: 124,234
Columns: 16
$ game_pk <dbl> 718773, 718774, 718773, 718778, 718781, 718778, 718772, 718770, 718776, 718778, 718…
$ game_date <date> 2023-03-30, 2023-03-30, 2023-03-30, 2023-03-30, 2023-03-30, 2023-03-30, 2023-03-30…
$ batter <dbl> 613564, 643446, 641584, 453568, 527038, 592178, 665489, 502110, 670623, 602074, 663…
$ pitcher <dbl> 656605, 645261, 656605, 605483, 543037, 605483, 571945, 668678, 593958, 605483, 669…
$ events <chr> "triple", "single", "single", "single", "grounded_into_double_play", "field_out", "…
$ stand <chr> "L", "L", "L", "L", "R", "R", "R", "R", "R", "R", "L", "L", "R", "L", "L", "R", "R"…
$ p_throws <chr> "R", "R", "R", "L", "R", "L", "R", "R", "L", "L", "R", "R", "L", "R", "R", "R", "R"…
$ hit_distance_sc <dbl> 134, 9, 254, 162, 51, 56, 42, 185, 143, 171, 240, 374, 356, 65, 392, 116, 422, 188,…
$ hc_x <dbl> 215.08, 164.78, 196.95, 90.66, 110.24, 153.95, 184.78, 177.78, 116.88, 177.33, 73.8…
$ hc_y <dbl> 107.23, 105.10, 95.17, 133.88, 148.44, 209.01, 100.21, 106.58, 74.73, 114.14, 117.8…
$ launch_speed <dbl> 94.2, 93.7, 111.7, 59.1, 94.8, 69.5, 115.5, 102.7, 105.7, 93.8, 93.4, 96.4, 93.5, 8…
$ launch_angle <dbl> 9, -19, 13, 27, 1, 81, -2, 9, 7, 9, 55, 28, 25, 76, 35, 6, 26, 12, 9, 32, -2, 1, -3…
$ home_team <chr> "CIN", "MIA", "CIN", "SD", "NYY", "SD", "STL", "LAD", "TB", "SD", "SEA", "MIA", "TB…
$ away_team <chr> "PIT", "NYM", "PIT", "COL", "SF", "COL", "TOR", "AZ", "DET", "COL", "CLE", "NYM", "…
$ Season <dbl> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023,…
$ HR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
- The Parquet format keeps the file small enough (3.4 MB) that we can include the data in the repo!