29.10 Feature Engineering

# Count the flights on each calendar date: build a Date from the
# year/month/day columns, then tally the rows per date into column `n`.
daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  count(date, name = "n")

# Line chart of the number of flights per day over time.
ggplot(daily, aes(x = date, y = n)) +
  geom_line()

Feature engineering: deriving new variables (features) from the existing data so they can be used as predictors in models.

# Derive the day of the week from each date as a labelled factor,
# with weeks starting on Monday (week_start = 1).
daily <- mutate(daily, wday = wday(date, label = TRUE, week_start = 1))

# Distribution of the daily flight counts for each weekday.
ggplot(daily, aes(x = wday, y = n)) +
  geom_boxplot()

# Linear model of the daily count with weekday as the only predictor.
mod <- lm(formula = n ~ wday, data = daily)

# One row per weekday, with the prediction from `mod` stored in a column
# named `n` so it shares the y variable of the plot below.
grid <- add_predictions(data_grid(daily, wday), mod, var = "n")

# Boxplots of the observed counts, with the model prediction for each
# weekday overlaid as a large red point.
ggplot(daily, aes(x = wday, y = n)) +
  geom_boxplot() +
  geom_point(data = grid, colour = "red", size = 4)
# Append the residuals of `mod` to `daily`; with no name supplied,
# add_residuals() stores them in a column called `resid`.
daily <- add_residuals(daily, mod)

# Residuals of `mod` over time. The zero reference is a thick white
# horizontal line drawn beneath the series; it is specified directly with
# geom_hline() and the `linewidth` argument, which gives the same look as
# modelr's geom_ref_line(h = 0) using only ggplot2.
base_plot <- ggplot(daily) +
  aes(date, resid) +
  geom_hline(yintercept = 0, linewidth = 2, colour = "white") +
  geom_line()

# Residuals over time.
base_plot

# Same plot with the line coloured by day of the week.
base_plot +
  aes(colour = wday)

# Same plot with a smoothed trend added (no confidence band, span 0.2).
base_plot +
  geom_smooth(span = 0.2, se = FALSE)
# Dates whose residual is below -100, i.e. the observed count is more than
# 100 flights under the model's prediction. The result is a vector of
# dates whose element names are the corresponding weekdays.
pull(filter(daily, resid < -100), var = date, name = wday)
# Classify dates into the three school terms of 2013.
#
# `date` is a vector of dates. The result is a factor with levels
# "spring", "summer" and "fall", obtained by cutting at 2013-01-01,
# 2013-06-05, 2013-08-25 and 2014-01-01.
term <- function(date) {
  term_breaks <- ymd(20130101, 20130605, 20130825, 20140101)
  term_labels <- c("spring", "summer", "fall")
  cut(date, breaks = term_breaks, labels = term_labels)
}

# Label every day with its school term.
daily <- mutate(daily, term = term(date))

# Robust linear model (MASS::rlm) with a weekday-by-term interaction.
mod2 <- MASS::rlm(n ~ wday * term, data = daily)

# Residuals of `mod2` over time. The residuals are computed on the fly into
# a `resid` column of the plotted data only (`daily` is not reassigned) and
# drawn over a thick white zero line.
ggplot(
  add_residuals(daily, mod2, var = "resid"),
  aes(x = date, y = resid)
) +
  geom_hline(yintercept = 0, linewidth = 2, colour = "white") +
  geom_line()