16.4 Prepare the beans data using recipes
1. Get the ingredients (`recipe()`): specify the response variable and predictor variables.
2. Write the recipe (`step_zzz()`): define the pre-processing steps, such as imputation, creating dummy variables, scaling, and more.
3. Prepare the recipe (`prep()`): provide a dataset to base each step on (e.g. if one of the steps is to remove variables that only have one unique value, then you need to give it a dataset so it can decide which variables satisfy this criterion, ensuring that it does the same thing to every dataset you apply it to).
4. Bake the recipe (`bake()`): apply the pre-processing steps to your datasets.
Using the recipes package for easy pre-processing
library(ggforce)
library(bestNormalize)
library(learntidymodels)
library(embed)
# Split the beans data into training and test partitions, stratified by class.
# The seed is fixed so the partition is reproducible.
set.seed(1701)
bean_split <- initial_split(beans, prop = 3/4, strata = class)
bean_train <- training(bean_split)
bean_test <- testing(bean_split)

# Split the training partition once more into a training/validation pair,
# again stratified by class and under its own seed.
set.seed(1702)
bean_val <- validation_split(bean_train, prop = 4/5, strata = class)

# Show the sizes of the single resample held in bean_val.
bean_val$splits[[1]]
## <Training/Validation/Total>
## <8163/2044/10207>
# Recipe for the beans data, defined on the analysis portion of the single
# split stored in bean_val.
bean_rec <-
  # 1. get the ingredients: class is the outcome, all other columns are
  #    predictors
  recipe(class ~ ., data = analysis(bean_val$splits[[1]])) %>%
  # 2. write the recipe: each step below targets every numeric predictor
  step_zv(all_numeric_predictors()) %>%
  step_orderNorm(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
# 3. prepare the recipe
# No training data is passed here, so prep() is called on the recipe alone.
bean_rec_trained <- bean_rec %>% prep()
# Prepare the recipe again with change logging switched on; each step then
# reports how it affected the columns (output shown below).
show_variables <- prep(bean_rec, log_changes = TRUE)
## step_zv (zv_SqX2i): same number of columns
##
## step_orderNorm (orderNorm_x4c8K): same number of columns
##
## step_normalize (normalize_NF9ZV): same number of columns
# Extract the assessment (validation) rows from the first split in bean_val.
bean_validation <- assessment(bean_val$splits[[1]])
# 4. bake the recipe: apply the prepared steps to the validation rows
bean_val_processed <- bean_rec_trained %>% bake(new_data = bean_validation)
#' Plot a scatterplot matrix of a recipe's processed output
#'
#' Prepares `recipe`, bakes `dat` with it, and draws the baked columns
#' (all except `class`) against one another in a facet matrix, with points
#' and densities colored and filled by `class`.
#'
#' @param recipe A recipe. It is passed through `prep()` before baking, so
#'   any additional steps are estimated here.
#' @param dat Data to process and plot. Defaults to the assessment
#'   (validation) portion of the first split in the global `bean_val`.
#' @return The ggplot object holding the scatterplot matrix.
plot_validation_results <- function(recipe,
                                    dat = assessment(bean_val$splits[[1]])) {
  recipe %>%
    # Estimate any additional steps
    prep() %>%
    # Process the data (the validation set by default)
    bake(new_data = dat) %>%
    # Create the scatterplot matrix
    ggplot(aes(x = .panel_x, y = .panel_y, col = class, fill = class)) +
    geom_point(alpha = 0.4, size = 0.5) +
    geom_autodensity(alpha = .3) +
    # layer.diag = 2: the second layer (geom_autodensity) is the one drawn on
    # the diagonal panels
    facet_matrix(vars(-class), layer.diag = 2) +
    scale_color_brewer(palette = "Dark2") +
    scale_fill_brewer(palette = "Dark2")
}
# Some examples of recipe steps: