2.9 Splitting, Applying, and Combining Data
library(Lahman)
# Home-run leaders of the 1960s: restrict to seasons 1960-1969, total each
# player's home runs over those seasons, and show the four largest totals.
Batting |>
  dplyr::filter(dplyr::between(yearID, 1960, 1969)) |>
  dplyr::group_by(playerID) |>
  dplyr::summarize(HR = sum(HR)) |>
  # Largest totals first, then keep the leading four rows.
  dplyr::arrange(dplyr::desc(HR)) |>
  dplyr::slice_head(n = 4)
## # A tibble: 4 × 2
## playerID HR
## <chr> <int>
## 1 killeha01 393
## 2 aaronha01 375
## 3 mayswi01 350
## 4 robinfr02 316
What if we want to find the top home-run (HR) hitter in each decade?
#' Find the top home-run hitter(s) in a batting table
#'
#' Totals `HR` per `playerID` over every row of `data` and returns the
#' players with the largest totals.
#'
#' @param data A table with columns `playerID` and `HR` (for example a
#'   subset of `Batting`). The function regroups by `playerID` itself.
#' @param n Number of leaders to return. Defaults to 1, the single leader.
#' @return A table with columns `playerID` and `HR`, sorted by decreasing
#'   `HR`, with at most `n` rows. Ties are broken by `playerID`, so players
#'   tied with the last returned row may be left out.
hr_leader <- function(data, n = 1) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)

  data |>
    dplyr::group_by(playerID) |>
    # na.rm = TRUE: a single missing HR entry must not turn a player's whole
    # total into NA and silently push them to the bottom of the ranking.
    dplyr::summarize(HR = sum(HR, na.rm = TRUE)) |>
    # desc() is namespaced because dplyr is never attached here; playerID is
    # an explicit secondary key so tied totals come out in a fixed order.
    dplyr::arrange(dplyr::desc(HR), playerID) |>
    dplyr::slice_head(n = n)
}
Do you see any potential issues with this function?
# Label every batting row with the first year of its decade (1987 -> 1980)
# and keep the table grouped by that label for the split step.
Batting_decade <- dplyr::group_by(
  dplyr::mutate(Batting, decade = 10 * (yearID %/% 10)),
  decade
)

# The decade labels, one per group, read from the grouping keys.
decades <- dplyr::group_keys(Batting_decade)[["decade"]]
decades
## [1] 1870 1880 1890 1900 1910 1920 1930 1940 1950 1960 1970 1980 1990 2000 2010
## [16] 2020
# Split: one data frame per decade group.
dplyr::group_split(Batting_decade) |>
  # Apply: find the home-run leader within each piece.
  lapply(hr_leader) |>
  # Name each result after its decade so the label survives the combine step.
  stats::setNames(decades) |>
  # Combine: stack the results, storing the list names in a `decade` column.
  dplyr::bind_rows(.id = "decade")
## # A tibble: 16 × 3
## decade playerID HR
## <chr> <chr> <int>
## 1 1870 pikeli01 21
## 2 1880 stoveha01 89
## 3 1890 duffyhu01 83
## 4 1900 davisha01 67
## 5 1910 cravaga01 116
## 6 1920 ruthba01 467
## 7 1930 foxxji01 415
## 8 1940 willite01 234
## 9 1950 snidedu01 326
## 10 1960 killeha01 393
## 11 1970 stargwi01 296
## 12 1980 schmimi01 313
## 13 1990 mcgwima01 405
## 14 2000 rodrial01 435
## 15 2010 cruzne02 346
## 16 2020 judgeaa01 147