Setting up the data

  • Keep only players with at least 2000 career at-bats (AB):
# Total each player's career at-bats, keep the players with at least 2000,
# and then pull in every row of `batting` belonging to those players.
batting_2000 <- batting |>
  group_by(playerID) |>
  summarise(AB_career = sum(AB, na.rm = TRUE)) |>
  # Filtering the one-row-per-player summary first keeps the join small;
  # the rows that survive are the same as filtering after the join.
  filter(AB_career >= 2000) |>
  inner_join(batting, by = "playerID")
  • Add the most played position:
# Games played by each player at each position, reduced to the single
# position at which the player appeared most often (one row per player).
Positions <- Fielding |>
  group_by(playerID, POS) |>
  # na.rm = TRUE matches how the career sums elsewhere in this file treat
  # missing values.
  summarize(Games = sum(G, na.rm = TRUE)) |>
  # summarize() drops only the POS grouping level, so the steps below still
  # operate within each playerID.
  arrange(playerID, desc(Games)) |>
  filter(POS == first(POS)) |>
  # Hand back an ungrouped table so later steps do not silently inherit the
  # playerID grouping.
  ungroup()
## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.
# Attach each player's most-played position (POS) and the games played there
# (Games); players with no row in Positions are dropped by the inner join.
batting_2000 <- inner_join(batting_2000, Positions, by = "playerID")
  • Add career statistics:
# Counting statistics that are summed over each player's career.
my_vars <- c(
  "G", "AB", "R", "H", "X2B", "X3B",
  "HR", "RBI", "BB", "SO", "SB"
)

# Career totals per player, plus batting average and slugging percentage.
C_totals <- batting_2000 |>
  group_by(playerID) |>
  summarize(across(all_of(my_vars), \(x) sum(x, na.rm = TRUE))) |>
  mutate(
    AVG = H / AB,
    # Total bases over AB. With singles = H - X2B - X3B - HR, the sum
    # singles + 2 * X2B + 3 * X3B + 4 * HR reduces to the form below.
    SLG = (H + X2B + 2 * X3B + 3 * HR) / AB
  )

# Add the most-played position and the position value used in the
# similarity score; positions not listed get 0.
C_totals <- C_totals |>
  inner_join(Positions, by = "playerID") |>
  mutate(
    Value_POS = case_when(
      POS == "C"  ~ 240,
      POS == "SS" ~ 168,
      POS == "2B" ~ 132,
      POS == "3B" ~ 84,
      POS == "OF" ~ 48,
      POS == "1B" ~ 12,
      TRUE ~ 0
    )
  )
  • Value_POS holds the position values that Bill James introduced for use in his similarity scores.