4.9 Exercises

4.9.1 Exercise 4.1

Relationship Between Winning Percentage and Run Differential Across Decades

Section 4.3 used a simple linear model to predict a team’s winning percentage based on its run differential. This model was fit using team data since the 2001 season.

Refit this linear model using data from the seasons 1961–1970, the seasons 1971–1980, the seasons 1981–1990, and the seasons 1991–2000.
Compare across the five decades the predicted winning percentage for a team with a run differential of 10 runs.

decade	beta_0	beta_1	RD_10
Run Differential: +10
over the decades
1910 to 1919	0.49996	0.00076	0.50754
1920 to 1929	0.49999	0.00063	0.50629
1930 to 1939	0.49988	0.00062	0.50605
1940 to 1949	0.49985	0.00066	0.50643
1950 to 1959	0.49997	0.00064	0.50632
1960 to 1969	0.49993	0.00071	0.50704
1970 to 1979	0.50000	0.00064	0.50639
1980 to 1989	0.49993	0.00071	0.50701
1990 to 1999	0.50000	0.00063	0.50632
2000 to 2009	0.49999	0.00062	0.50621
2010 to 2019	0.49999	0.00061	0.50607
2020 to 2023	0.49998	0.00063	0.50624

table code

ch4_1_data <- Teams |>
  select(yearID, teamID, W, L, R, RA) |>
  mutate(RD = R - RA) |>
  mutate(Wpct = W / (W + L))

fit_1910s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1910 & yearID < 1920))
fit_1920s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1920 & yearID < 1930))
fit_1930s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1930 & yearID < 1940))
fit_1940s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1940 & yearID < 1950))
fit_1950s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1950 & yearID < 1960))
fit_1960s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1960 & yearID < 1970))
fit_1970s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1970 & yearID < 1980))
fit_1980s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1980 & yearID < 1990))
fit_1990s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 1990 & yearID < 2000))
fit_2000s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 2000 & yearID < 2010))
fit_2010s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 2010 & yearID < 2020))
fit_2020s <- lm(Wpct ~ RD, data = ch4_1_data |> filter(yearID >= 2020))

pred_1910s_R10 <- predict(fit_1910s, newdata = data.frame(RD = 10))
pred_1920s_R10 <- predict(fit_1920s, newdata = data.frame(RD = 10))
pred_1930s_R10 <- predict(fit_1930s, newdata = data.frame(RD = 10))
pred_1940s_R10 <- predict(fit_1940s, newdata = data.frame(RD = 10))
pred_1950s_R10 <- predict(fit_1950s, newdata = data.frame(RD = 10))
pred_1960s_R10 <- predict(fit_1960s, newdata = data.frame(RD = 10))
pred_1970s_R10 <- predict(fit_1970s, newdata = data.frame(RD = 10))
pred_1980s_R10 <- predict(fit_1980s, newdata = data.frame(RD = 10))
pred_1990s_R10 <- predict(fit_1990s, newdata = data.frame(RD = 10))
pred_2000s_R10 <- predict(fit_2000s, newdata = data.frame(RD = 10))
pred_2010s_R10 <- predict(fit_2010s, newdata = data.frame(RD = 10))
pred_2020s_R10 <- predict(fit_2020s, newdata = data.frame(RD = 10))

starts <- seq(1910, 2020, by = 10)
ends   <- c(seq(1919, 2019, by = 10), 2023)
decade <- paste(starts, "to", ends)
beta_0 <- c(
  fit_1910s$coefficients[1],
  fit_1920s$coefficients[1],
  fit_1930s$coefficients[1],
  fit_1940s$coefficients[1],
  fit_1950s$coefficients[1],
  fit_1960s$coefficients[1],
  fit_1970s$coefficients[1],
  fit_1980s$coefficients[1],
  fit_1990s$coefficients[1],
  fit_2000s$coefficients[1],
  fit_2010s$coefficients[1],
  fit_2020s$coefficients[1])
beta_1 <- c(
  fit_1910s$coefficients[2],
  fit_1920s$coefficients[2],
  fit_1930s$coefficients[2],
  fit_1940s$coefficients[2],
  fit_1950s$coefficients[2],
  fit_1960s$coefficients[2],
  fit_1970s$coefficients[2],
  fit_1980s$coefficients[2],
  fit_1990s$coefficients[2],
  fit_2000s$coefficients[2],
  fit_2010s$coefficients[2],
  fit_2020s$coefficients[2])
RD_10 <- c(
  pred_1910s_R10,
  pred_1920s_R10,
  pred_1930s_R10,
  pred_1940s_R10,
  pred_1950s_R10,
  pred_1960s_R10,
  pred_1970s_R10,
  pred_1980s_R10,
  pred_1990s_R10,
  pred_2000s_R10,
  pred_2010s_R10,
  pred_2020s_R10)

df_4_1 <- data.frame(decade, beta_0, beta_1, RD_10)

df_4_1 |>
  gt() |>
  cols_align(align = "center") |>
  fmt_number(columns = -decade,
             decimals = 5) |>
  tab_header(title = "Run Differential: +10",
             subtitle = "over the decades") |>
  tab_style(
    style = list(
      cell_fill(color = "dodgerblue"),
      cell_text(weight = "bold")),
    locations = cells_body(columns = RD_10))

4.9.2 Exercise 4.2

Pythagorean Residuals for Poor and Great Teams in the 19th Century

As baseball was evolving into its modern form, 19th century leagues often featured abysmal teams that did not even succeed in finishing their season, as well as some dominant clubs.

Fit a Pythagorean formula model to the run differential, win-loss data for teams who played in the 19th century.
By inspecting the residual plot of your fitted model from (a), did the great and poor teams in the 19th century do better or worse than one would expect on the basis of their run differentials?

\[\widehat{W_{\text{pct}}} = \frac{R^{1.931}}{R^{1.931} + {RA^{1.931}}}\]

R code

ch4_2_data <- Teams |>
  filter(yearID < 1900) |>
  select(yearID, franchID, W, L, R, RA) |>
  mutate(RD = R - RA) |>
  mutate(Wpct = W / (W + L)) |>
  mutate(logWratio = log(W/L),
         logRratio = log(R/RA))

# handle winless teams!
ch4_2_data$logWratio[ch4_2_data$W == 0] <- -3.1415

pyt_fit <- lm(logWratio ~ 0 + logRratio, data = ch4_2_data)
pyt_fit$coefficients

## logRratio 
##  1.931166

ch4_2_data <- ch4_2_data |>
  mutate(Wpct_pyt = R^1.931 / (R^1.931 + RA^1.931),
         resid_pyt = Wpct - Wpct_pyt)

yearID	franchName	W	L	R	RA	Wpct	Wpct_pyt	resid_pyt	desc
1800s Baseball
Best Teams
1875	Boston Red Stockings	71	8	831	343	0.8987	0.8467	0.0521	lucky
1884	St. Louis Maroons	94	19	887	429	0.8319	0.8026	0.0293	lucky
1872	Boston Red Stockings	39	8	521	236	0.8298	0.8219	0.0079	lucky
1880	Chicago Cubs	67	17	538	317	0.7976	0.7352	0.0624	lucky
1876	Chicago Cubs	52	14	624	257	0.7879	0.8472	−0.0593	unlucky
1885	Chicago Cubs	87	25	834	470	0.7768	0.7516	0.0251	lucky
1885	San Francisco Giants	85	27	691	370	0.7589	0.7696	−0.0107	unlucky
1871	Philadelphia Athletics	21	7	376	266	0.7500	0.6611	0.0889	lucky
1884	Providence Grays	84	28	665	388	0.7500	0.7389	0.0111	lucky
1874	Boston Red Stockings	52	18	735	415	0.7429	0.7510	−0.0081	unlucky

yearID	franchName	W	L	R	RA	Wpct	Wpct_pyt	resid_pyt	desc
1800s Baseball
Worst Teams
1872	Washington Nationals	0	11	80	190	0.0000	0.1584	−0.1584	unlucky
1873	Baltimore Marylands	0	6	26	152	0.0000	0.0320	−0.0320	unlucky
1875	Brooklyn Atlantics	2	42	132	438	0.0455	0.0898	−0.0443	unlucky
1875	Keokuk Westerns	1	12	45	88	0.0769	0.2150	−0.1381	unlucky
1873	Elizabeth Resolutes	2	21	98	299	0.0870	0.1040	−0.0170	unlucky
1872	Brooklyn Eckfords	3	26	152	413	0.1034	0.1267	−0.0233	unlucky
1884	Wilmington Quicksteps	2	16	35	114	0.1111	0.0928	0.0183	lucky
1899	Cleveland Spiders	20	134	529	1252	0.1299	0.1593	−0.0294	unlucky
1876	Cincinnati Reds	9	56	238	579	0.1385	0.1523	−0.0138	unlucky
1875	Philadelphia Centennials	2	12	70	138	0.1429	0.2124	−0.0695	unlucky

code for tables

ch4_2_data |>
  select(yearID, franchID, W, L, R, RA, Wpct, Wpct_pyt, resid_pyt) |>
  slice_max(n = 10, order_by = Wpct) |>
  inner_join(TeamsFranchises, by = "franchID") |>
  select(yearID, franchName, W, L, R, RA, Wpct, Wpct_pyt, resid_pyt) |>
  mutate(desc = ifelse(resid_pyt > 0, "lucky", "unlucky")) |>
  gt() |>
  cols_align(align = "center") |>
  data_color(columns = desc,
             palette = "viridis",
             reverse = TRUE) |>
  fmt_number(columns = c(Wpct, Wpct_pyt, resid_pyt),
             decimals = 4) |>
  tab_header(title = "1800s Baseball",
             subtitle = "Best Teams") |>
  tab_style(
    style = list(
      cell_fill(color = "dodgerblue"),
      cell_text(weight = "bold")),
    locations = cells_body(columns = Wpct))

ch4_2_data |>
  select(yearID, franchID, W, L, R, RA, Wpct, Wpct_pyt, resid_pyt) |>
  slice_min(n = 10, order_by = Wpct) |>
  inner_join(TeamsFranchises, by = "franchID") |>
  select(yearID, franchName, W, L, R, RA, Wpct, Wpct_pyt, resid_pyt) |>
  mutate(desc = ifelse(resid_pyt > 0, "lucky", "unlucky")) |>
  gt() |>
  cols_align(align = "center") |>
  data_color(columns = desc,
             palette = "viridis",
             reverse = TRUE) |>
  fmt_number(columns = c(Wpct, Wpct_pyt, resid_pyt),
             decimals = 4) |>
  tab_header(title = "1800s Baseball",
             subtitle = "Worst Teams") |>
  tab_style(
    style = list(
      cell_fill(color = "dodgerblue"),
      cell_text(weight = "bold")),
    locations = cells_body(columns = Wpct))