4.4 Correlation
In short, we are looking for correlation values close to 1.0 or -1.0, since those indicate a strong linear relationship between a statistic and win percentage.
4.4.1 Offense
graph code
# Correlation between runs scored (R) and win percentage (Wpct), computed
# over the rows where both values are present; shown in the plot subtitle.
cor_value <- cor(ch4_data$R, ch4_data$Wpct,
                 use = "pairwise.complete.obs")

# Rows to label: within each playoff_bool group, the teams with the highest
# and the lowest runs scored (ties are all kept).
subset_for_labels <- ch4_data |>
  select(R, Wpct, playoff_bool, yearID, teamID) |>
  # Drop incomplete rows first: a single NA would turn max()/min() into NA
  # and silently remove every label in that group, and a row missing either
  # coordinate cannot be drawn anyway.
  filter(!is.na(R), !is.na(Wpct)) |>
  group_by(playoff_bool) |>
  mutate(highlight = ifelse(
    R == max(R) | R == min(R),
    1, 0
  )) |>
  ungroup() |>
  filter(highlight == 1) |>
  mutate(year_team = paste(yearID, teamID))

# Scatter plot of every team-season, with the extreme teams labelled.
ch4_data |>
  ggplot(aes(x = R, y = Wpct)) +
  geom_point(aes(color = playoff_bool)) +
  # x and y are inherited from ggplot(); only the label-specific mappings
  # need to be stated for this layer.
  geom_label(aes(color = playoff_bool, label = year_team),
             data = subset_for_labels) +
  labs(title = "Win Percentage vs Runs Scored",
       subtitle = paste0("correlation: r = ", round(cor_value, 4)),
       caption = "1998 to 2023 seasons",
       x = "runs scored",
       y = "win percentage") +
  # NOTE(review): the two colours are unnamed, so they are assigned to the
  # values of playoff_bool in scale order -- confirm which value gets which.
  scale_color_manual(values = c("#2905A1", "#AAAAAA")) +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.title = element_blank())
4.4.2 Defense
graph code
# Correlation between runs allowed (RA) and win percentage (Wpct), computed
# over the rows where both values are present; shown in the plot subtitle.
cor_value <- cor(ch4_data$RA, ch4_data$Wpct,
                 use = "pairwise.complete.obs")

# Rows to label: within each playoff_bool group, the teams with the highest
# and the lowest runs allowed (ties are all kept).
subset_for_labels <- ch4_data |>
  select(RA, Wpct, playoff_bool, yearID, teamID) |>
  # Drop incomplete rows first: a single NA would turn max()/min() into NA
  # and silently remove every label in that group, and a row missing either
  # coordinate cannot be drawn anyway.
  filter(!is.na(RA), !is.na(Wpct)) |>
  group_by(playoff_bool) |>
  mutate(highlight = ifelse(
    RA == max(RA) | RA == min(RA),
    1, 0
  )) |>
  ungroup() |>
  filter(highlight == 1) |>
  mutate(year_team = paste(yearID, teamID))

# Scatter plot of every team-season, with the extreme teams labelled.
ch4_data |>
  ggplot(aes(x = RA, y = Wpct)) +
  geom_point(aes(color = playoff_bool)) +
  # x and y are inherited from ggplot(); only the label-specific mappings
  # need to be stated for this layer.
  geom_label(aes(color = playoff_bool, label = year_team),
             data = subset_for_labels) +
  labs(title = "Win Percentage vs Runs Allowed",
       subtitle = paste0("correlation: r = ", round(cor_value, 4)),
       caption = "1998 to 2023 seasons",
       x = "runs allowed",
       y = "win percentage") +
  # NOTE(review): the two colours are unnamed, so they are assigned to the
  # values of playoff_bool in scale order -- confirm which value gets which.
  scale_color_manual(values = c("#2905A1", "#AAAAAA")) +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.title = element_blank())
4.4.3 Run Differential
graph code
# Correlation between run differential (RD) and win percentage (Wpct),
# computed over the rows where both values are present; shown in the plot
# subtitle.
cor_value <- cor(ch4_data$RD, ch4_data$Wpct,
                 use = "pairwise.complete.obs")

# Rows to label: within each playoff_bool group, the teams with the highest
# and the lowest run differential (ties are all kept).
subset_for_labels <- ch4_data |>
  select(RD, Wpct, playoff_bool, yearID, teamID) |>
  # Drop incomplete rows first: a single NA would turn max()/min() into NA
  # and silently remove every label in that group, and a row missing either
  # coordinate cannot be drawn anyway.
  filter(!is.na(RD), !is.na(Wpct)) |>
  group_by(playoff_bool) |>
  mutate(highlight = ifelse(
    RD == max(RD) | RD == min(RD),
    1, 0
  )) |>
  ungroup() |>
  filter(highlight == 1) |>
  mutate(year_team = paste(yearID, teamID))

# Scatter plot of every team-season, with the extreme teams labelled.
ch4_data |>
  ggplot(aes(x = RD, y = Wpct)) +
  geom_point(aes(color = playoff_bool)) +
  # x and y are inherited from ggplot(); only the label-specific mappings
  # need to be stated for this layer.
  geom_label(aes(color = playoff_bool, label = year_team),
             data = subset_for_labels) +
  labs(title = "Win Percentage vs Run Differential",
       subtitle = paste0("correlation: r = ", round(cor_value, 4)),
       caption = "1998 to 2023 seasons",
       x = "run differential",
       y = "win percentage") +
  # NOTE(review): the two colours are unnamed, so they are assigned to the
  # values of playoff_bool in scale order -- confirm which value gets which.
  scale_color_manual(values = c("#2905A1", "#AAAAAA")) +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.title = element_blank())