16.4 Prepare the beans data using recipes

Get the ingredients (recipe()): specify the response variable and predictor variables

Write the recipe (step_zzz()): define the pre-processing steps, such as imputation, creating dummy variables, scaling, and more

Prepare the recipe (prep()): provide a dataset on which to base each step. For example, if one of the steps removes variables that have only one unique value, prep() needs a dataset to decide which variables satisfy this criterion. That decision is then fixed, which ensures the recipe does the same thing to every dataset you apply it to.

Bake the recipe (bake()): apply the pre-processing steps to your datasets

Using the recipes package for easy pre-processing

library(ggforce)
library(bestNormalize)
library(learntidymodels)
library(embed)

# Fix the RNG seed so the random split below is reproducible
set.seed(1701)
# Initial split: 3/4 of the rows go to training, stratified by `class`
bean_split <- initial_split(beans, strata = class, prop = 3/4)

# Extract the two halves of the initial split
bean_train <- training(bean_split)
bean_test  <- testing(bean_split)

# Separate seed for the second random split
set.seed(1702)
# Split the training set again (4/5 vs 1/5), stratified by `class`, to get a
# validation set
# NOTE(review): validation_split() is deprecated in recent rsample releases in
# favor of initial_validation_split() -- confirm against the installed version
bean_val <- validation_split(bean_train, strata = class, prop = 4/5)
# Print the first (and, for a validation split, presumably only) split object;
# the recorded console output follows
bean_val$splits[[1]]
## <Training/Validation/Total>
## <8163/2044/10207>
# Recipe that models `class` as a function of every other column. The steps
# are only declared here; nothing is estimated until prep() is called.
bean_rec <-
  # Use the training data from the bean_val split object
  # 1. get the ingredients
  recipe(class ~ ., data = analysis(bean_val$splits[[1]])) %>%
  # 2. write the recipe
  # Remove numeric predictors that have zero variance
  step_zv(all_numeric_predictors()) %>%
  # Ordered quantile normalization (step from the bestNormalize package)
  step_orderNorm(all_numeric_predictors()) %>% 
  # Center and scale the numeric predictors
  step_normalize(all_numeric_predictors())

# 3. prepare the recipe
# With no other data supplied, prep() estimates each step from the data that
# was given to recipe()
bean_rec_trained <-
  bean_rec %>%
  prep()

# Prep a second time with log_changes = TRUE so that each step reports how it
# changed the columns; the recorded console output follows
show_variables <- prep(bean_rec, log_changes = TRUE)
## step_zv (zv_SqX2i): same number of columns
## 
## step_orderNorm (orderNorm_x4c8K): same number of columns
## 
## step_normalize (normalize_NF9ZV): same number of columns
# Validation rows: the assessment portion of the first split in bean_val
bean_validation <- assessment(bean_val$splits[[1]])

# 4. bake the recipe
# Apply the already-trained steps to the validation rows
bean_val_processed <-
  bean_rec_trained %>%
  bake(new_data = bean_validation)

#' Scatterplot matrix of the data produced by a recipe
#'
#' Preps `recipe`, bakes `dat` with it, and plots the processed columns
#' (everything except `class`) against each other, colored by `class`.
#'
#' @param recipe A recipe; it is passed through `prep()` before being applied.
#' @param dat Data to process and plot. Defaults to the assessment set of the
#'   first split in the global `bean_val` object.
#' @return A ggplot object.
plot_validation_results <- function(
    recipe,
    dat = assessment(bean_val$splits[[1]])
) {
  # Estimate any additional steps
  fitted_recipe <- prep(recipe)

  # Process the data (the validation set by default)
  baked <- bake(fitted_recipe, new_data = dat)

  # Create the scatterplot matrix. Layer order matters: layer.diag = 2 refers
  # to the second layer added (the densities).
  panels <-
    ggplot(baked, aes(x = .panel_x, y = .panel_y, col = class, fill = class)) +
    geom_point(alpha = 0.4, size = 0.5) +
    geom_autodensity(alpha = .3) +
    facet_matrix(vars(-class), layer.diag = 2)

  # Same palette for outlines and fills
  panels +
    scale_color_brewer(palette = "Dark2") +
    scale_fill_brewer(palette = "Dark2")
}

Some examples of recipe steps: