4.9 Exercises

4.9.1 Exercise 4.1

Relationship Between Winning Percentage and Run Differential Across Decades

Section 4.3 used a simple linear model to predict a team’s winning percentage based on its run differential. This model was fit using team data since the 2001 season.

  • Refit this linear model using data from the seasons 1961–1970, the seasons 1971–1980, the seasons 1981–1990, and the seasons 1991–2000.
  • Compare across the five decades the predicted winning percentage for a team with a run differential of 10 runs.
Run Differential: +10
over the decades
decade beta_0 beta_1 RD_10
1910 to 1919 0.49996 0.00076 0.50754
1920 to 1929 0.49999 0.00063 0.50629
1930 to 1939 0.49988 0.00062 0.50605
1940 to 1949 0.49985 0.00066 0.50643
1950 to 1959 0.49997 0.00064 0.50632
1960 to 1969 0.49993 0.00071 0.50704
1970 to 1979 0.50000 0.00064 0.50639
1980 to 1989 0.49993 0.00071 0.50701
1990 to 1999 0.50000 0.00063 0.50632
2000 to 2009 0.49999 0.00062 0.50621
2010 to 2019 0.49999 0.00061 0.50607
2020 to 2023 0.49998 0.00063 0.50624
table code
ch4_1_data <- Teams |>
  select(yearID, teamID, W, L, R, RA) |>
  mutate(RD = R - RA) |>
  mutate(Wpct = W / (W + L))

fit_1910s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1910 & yearID < 1920))
fit_1920s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1920 & yearID < 1930))
fit_1930s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1930 & yearID < 1940))
fit_1940s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1940 & yearID < 1950))
fit_1950s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1950 & yearID < 1960))
fit_1960s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1960 & yearID < 1970))
fit_1970s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1970 & yearID < 1980))
fit_1980s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1980 & yearID < 1990))
fit_1990s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1990 & yearID < 2000))
fit_2000s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 2000 & yearID < 2010))
fit_2010s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 2010 & yearID < 2020))
fit_2020s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 2020))

pred_1910s_R10 <- predict(fit_1910s, newdata = data.frame(RD = 10))
pred_1920s_R10 <- predict(fit_1920s, newdata = data.frame(RD = 10))
pred_1930s_R10 <- predict(fit_1930s, newdata = data.frame(RD = 10))
pred_1940s_R10 <- predict(fit_1940s, newdata = data.frame(RD = 10))
pred_1950s_R10 <- predict(fit_1950s, newdata = data.frame(RD = 10))
pred_1960s_R10 <- predict(fit_1960s, newdata = data.frame(RD = 10))
pred_1970s_R10 <- predict(fit_1970s, newdata = data.frame(RD = 10))
pred_1980s_R10 <- predict(fit_1980s, newdata = data.frame(RD = 10))
pred_1990s_R10 <- predict(fit_1990s, newdata = data.frame(RD = 10))
pred_2000s_R10 <- predict(fit_2000s, newdata = data.frame(RD = 10))
pred_2010s_R10 <- predict(fit_2010s, newdata = data.frame(RD = 10))
pred_2020s_R10 <- predict(fit_2020s, newdata = data.frame(RD = 10))

starts <- seq(1910, 2020, by = 10)
ends   <- c(seq(1919, 2019, by = 10), 2023)
decade <- paste(starts, "to", ends)
beta_0 <- c(
  fit_1910s$coefficients[1],
  fit_1920s$coefficients[1],
  fit_1930s$coefficients[1],
  fit_1940s$coefficients[1],
  fit_1950s$coefficients[1],
  fit_1960s$coefficients[1],
  fit_1970s$coefficients[1],
  fit_1980s$coefficients[1],
  fit_1990s$coefficients[1],
  fit_2000s$coefficients[1],
  fit_2010s$coefficients[1],
  fit_2020s$coefficients[1])
beta_1 <- c(
  fit_1910s$coefficients[2],
  fit_1920s$coefficients[2],
  fit_1930s$coefficients[2],
  fit_1940s$coefficients[2],
  fit_1950s$coefficients[2],
  fit_1960s$coefficients[2],
  fit_1970s$coefficients[2],
  fit_1980s$coefficients[2],
  fit_1990s$coefficients[2],
  fit_2000s$coefficients[2],
  fit_2010s$coefficients[2],
  fit_2020s$coefficients[2])
RD_10 <- c(
  pred_1910s_R10,
  pred_1920s_R10,
  pred_1930s_R10,
  pred_1940s_R10,
  pred_1950s_R10,
  pred_1960s_R10,
  pred_1970s_R10,
  pred_1980s_R10,
  pred_1990s_R10,
  pred_2000s_R10,
  pred_2010s_R10,
  pred_2020s_R10)

df_4_1 <- data.frame(decade, beta_0, beta_1, RD_10)

df_4_1 |>
  gt() |>
  cols_align(align = "center") |>
  fmt_number(columns = -decade,
             decimals = 5) |>
  tab_header(title = "Run Differential: +10",
             subtitle = "over the decades") |>
  tab_style(
    style = list(
      cell_fill(color = "dodgerblue"),
      cell_text(weight = "bold")),
    locations = cells_body(columns = RD_10))

4.9.2 Exercise 4.2

Pythagorean Residuals for Poor and Great Teams in the 19th Century

As baseball was evolving into its modern form, 19th century leagues often featured abysmal teams that did not even succeed in finishing their season, as well as some dominant clubs.

  • Fit a Pythagorean formula model to the run differential, win-loss data for teams who played in the 19th century.
  • By inspecting the residual plot of your fitted model from (a), did the great and poor teams in the 19th century do better or worse than one would expect on the basis of their run differentials?

\[\widehat{W_{\text{pct}}} = \frac{R^{1.931}}{R^{1.931} + {RA^{1.931}}}\]

R code
ch4_2_data <- Teams |>
  filter(yearID < 1900) |>
  select(yearID, franchID, W, L, R, RA) |>
  mutate(RD = R - RA) |>
  mutate(Wpct = W / (W + L)) |>
  mutate(logWratio = log(W/L),
         logRratio = log(R/RA))

# handle winless teams!
ch4_2_data$logWratio[ch4_2_data$W == 0] <- -3.1415

pyt_fit <- lm(logWratio ~ 0 + logRratio, data = ch4_2_data)
pyt_fit$coefficients
## logRratio 
##  1.931166
ch4_2_data <- ch4_2_data |>
  mutate(Wpct_pyt = R^1.931 / (R^1.931 + RA^1.931),
         resid_pyt = Wpct - Wpct_pyt)
1800s Baseball
Best Teams
yearID franchName W L R RA Wpct Wpct_pyt resid_pyt desc
1875 Boston Red Stockings 71 8 831 343 0.8987 0.8467 0.0521 lucky
1884 St. Louis Maroons 94 19 887 429 0.8319 0.8026 0.0293 lucky
1872 Boston Red Stockings 39 8 521 236 0.8298 0.8219 0.0079 lucky
1880 Chicago Cubs 67 17 538 317 0.7976 0.7352 0.0624 lucky
1876 Chicago Cubs 52 14 624 257 0.7879 0.8472 −0.0593 unlucky
1885 Chicago Cubs 87 25 834 470 0.7768 0.7516 0.0251 lucky
1885 San Francisco Giants 85 27 691 370 0.7589 0.7696 −0.0107 unlucky
1871 Philadelphia Athletics 21 7 376 266 0.7500 0.6611 0.0889 lucky
1884 Providence Grays 84 28 665 388 0.7500 0.7389 0.0111 lucky
1874 Boston Red Stockings 52 18 735 415 0.7429 0.7510 −0.0081 unlucky
1800s Baseball
Worst Teams
yearID franchName W L R RA Wpct Wpct_pyt resid_pyt desc
1872 Washington Nationals 0 11 80 190 0.0000 0.1584 −0.1584 unlucky
1873 Baltimore Marylands 0 6 26 152 0.0000 0.0320 −0.0320 unlucky
1875 Brooklyn Atlantics 2 42 132 438 0.0455 0.0898 −0.0443 unlucky
1875 Keokuk Westerns 1 12 45 88 0.0769 0.2150 −0.1381 unlucky
1873 Elizabeth Resolutes 2 21 98 299 0.0870 0.1040 −0.0170 unlucky
1872 Brooklyn Eckfords 3 26 152 413 0.1034 0.1267 −0.0233 unlucky
1884 Wilmington Quicksteps 2 16 35 114 0.1111 0.0928 0.0183 lucky
1899 Cleveland Spiders 20 134 529 1252 0.1299 0.1593 −0.0294 unlucky
1876 Cincinnati Reds 9 56 238 579 0.1385 0.1523 −0.0138 unlucky
1875 Philadelphia Centennials 2 12 70 138 0.1429 0.2124 −0.0695 unlucky
code for tables
ch4_2_data |>
  select(yearID, franchID, W, L, R, RA, Wpct, Wpct_pyt, resid_pyt) |>
  slice_max(n = 10, order_by = Wpct) |>
  inner_join(TeamsFranchises, by = "franchID") |>
  select(yearID, franchName, W, L, R, RA, Wpct, Wpct_pyt, resid_pyt) |>
  mutate(desc = ifelse(resid_pyt > 0, "lucky", "unlucky")) |>
  gt() |>
  cols_align(align = "center") |>
  data_color(columns = desc,
             palette = "viridis",
             reverse = TRUE) |>
  fmt_number(columns = c(Wpct, Wpct_pyt, resid_pyt),
             decimals = 4) |>
  tab_header(title = "1800s Baseball",
             subtitle = "Best Teams") |>
  tab_style(
    style = list(
      cell_fill(color = "dodgerblue"),
      cell_text(weight = "bold")),
    locations = cells_body(columns = Wpct))

ch4_2_data |>
  select(yearID, franchID, W, L, R, RA, Wpct, Wpct_pyt, resid_pyt) |>
  slice_min(n = 10, order_by = Wpct) |>
  inner_join(TeamsFranchises, by = "franchID") |>
  select(yearID, franchName, W, L, R, RA, Wpct, Wpct_pyt, resid_pyt) |>
  mutate(desc = ifelse(resid_pyt > 0, "lucky", "unlucky")) |>
  gt() |>
  cols_align(align = "center") |>
  data_color(columns = desc,
             palette = "viridis",
             reverse = TRUE) |>
  fmt_number(columns = c(Wpct, Wpct_pyt, resid_pyt),
             decimals = 4) |>
  tab_header(title = "1800s Baseball",
             subtitle = "Worst Teams") |>
  tab_style(
    style = list(
      cell_fill(color = "dodgerblue"),
      cell_text(weight = "bold")),
    locations = cells_body(columns = Wpct))