PCA Example

R setup
# Two-class subset of `penguins`: keep the Chinstrap and Gentoo rows, then
# discard any row that has a missing value in any column.
penguin_2_class <- na.omit(
  filter(penguins, species %in% c("Chinstrap", "Gentoo"))
)

# Scatter plot of the two predictors (flipper length vs. bill length), with
# points coloured by species. The plot is printed, not stored.
ggplot(
  data = penguin_2_class,
  mapping = aes(
    x = flipper_length_mm,
    y = bill_length_mm,
    color = species
  )
) +
  geom_point(size = 3) +
  scale_color_manual(values = c(chinstrap_color, gentoo_color)) +
  labs(
    title = "Two Predictor Variables",
    subtitle = "50mm-long bill and 195mm-long flipper",
    caption = "Data Science Learning Community"
  ) +
  theme_minimal() +
  # Title and subtitle are rendered through element_markdown().
  theme(
    plot.title = element_markdown(face = "bold", size = 24),
    plot.subtitle = element_markdown(size = 16)
  )

# Training data for the PCA: only the two numeric predictors.
train_set <- select(penguin_2_class, flipper_length_mm, bill_length_mm)

# Fit the PCA on centred and unit-variance-scaled columns.
pca_results <- prcomp(
  train_set,
  scale. = TRUE,
  center = TRUE
)
PCA math
# Slope and intercept of the first principal component (PC1), expressed in
# the original (mm) units so the line can be drawn on the raw scatter plot.

# PC1 loadings: components of the PC1 direction along flipper length (x)
# and bill length (y), in the coordinate system the PCA was fitted in.
del_x <- pca_results$rotation[1, 1]
del_y <- pca_results$rotation[2, 1]

# When prcomp() scales the columns, the loadings describe a direction in
# standardised space. Converting that direction back to raw units multiplies
# each component by its column's scaling factor, so the raw-unit slope is the
# standardised slope times scale_y / scale_x. `$scale` is FALSE when no
# scaling was applied, in which case no correction is needed.
pca_scale <- pca_results$scale
unit_ratio <- if (is.numeric(pca_scale)) pca_scale[[2]] / pca_scale[[1]] else 1
pca_slope <- (del_y / del_x) * unit_ratio

# The PC1 line is anchored at the point of column means (xbar, ybar).
xbar <- mean(train_set$flipper_length_mm, na.rm = TRUE)
ybar <- mean(train_set$bill_length_mm, na.rm = TRUE)
pca_intercept <- ybar - pca_slope * xbar

# Scatter plot of the two predictors (no species colouring) with the first
# principal component drawn on top as a thick line.
pca_plot_1 <- ggplot(
  data = penguin_2_class,
  mapping = aes(x = flipper_length_mm, y = bill_length_mm)
) +
  geom_point(size = 3) +
  geom_abline(
    intercept = pca_intercept,
    slope = pca_slope,
    linewidth = 3,
    color = adelie_color
  ) +
  labs(
    title = "Principal Component Analysis",
    subtitle = "<span style = 'color:#fb7504'>first principal component</span>",
    caption = "Data Science Learning Community"
  ) +
  # scale_color_manual(values = c(chinstrap_color, gentoo_color)) +
  theme_minimal() +
  # element_markdown() lets the subtitle's inline <span> colour be rendered.
  theme(
    plot.title = element_markdown(face = "bold", size = 14),
    plot.subtitle = element_markdown(size = 12)
  )

# Print the stored plot.
pca_plot_1

  • PC1 is the direction of greatest variance across the entire data set; the species labels are not used to find it
Projection math
# Project every observation onto the first principal component (PC1 scores).
train_mat <- as.matrix(train_set)

# PC1 loadings as a one-column matrix, ready for matrix multiplication.
proj_mat  <- as.matrix(pca_results$rotation[, 1])

# The loadings were fitted on centred (and scaled) data, so the same
# centring and scaling must be applied before projecting. Multiplying the raw
# matrix by the loadings does not give PC1 scores. `$center` and `$scale`
# carry the values prcomp() used (or FALSE if a step was skipped), and
# scale() accepts either form.
train_std <- scale(
  train_mat,
  center = pca_results$center,
  scale = pca_results$scale
)

# n x 1 matrix of PC1 scores; should match the first column of pca_results$x.
projection_data <- train_std %*% proj_mat

# Attach the scores to the original rows so they can be plotted by species.
projection_df <- cbind(penguin_2_class, projection_data)

# Density of the PC1 projection, one filled curve per species. The y axis
# (title, tick labels, ticks) is hidden.
pca_plot_2 <- ggplot(
  data = projection_df,
  mapping = aes(x = projection_data)
) +
  geom_density(mapping = aes(fill = species), alpha = 0.5) +
  scale_fill_manual(values = c(chinstrap_color, gentoo_color)) +
  labs(
    title = "Classification via <br><span style = 'color:#fb7504'>Principal Component Analysis</span>",
    subtitle = "",
    caption = "Data Science Learning Community",
    x = "(PC1) first principal component",
    y = ""
  ) +
  theme_minimal() +
  theme(
    # Markdown-rendered title, so the <br> and <span> markup take effect.
    plot.title    = element_markdown(face = "bold", size = 24),
    plot.subtitle = element_markdown(size = 16),
    # Remove all y-axis decoration.
    axis.title.y  = element_blank(),
    axis.text.y   = element_blank(),
    axis.ticks.y  = element_blank()
  )

# Print the stored plot.
pca_plot_2