16.4 Prepare the beans data using recipes
- Get the ingredients (`recipe()`): specify the response variable and predictor variables.
- Write the recipe (`step_zzz()`): define the pre-processing steps, such as imputation, creating dummy variables, scaling, and more.
- Prepare the recipe (`prep()`): provide a dataset to base each step on (e.g. if one of the steps is to remove variables that only have one unique value, then you need to give it a dataset so it can decide which variables satisfy this criterion, which ensures that it does the same thing to every dataset you apply it to).
- Bake the recipe (`bake()`): apply the pre-processing steps to your datasets.
Using the recipes package for easy pre-processing
library(ggforce)
library(bestNormalize)
library(learntidymodels)
library(embed)
# Split the beans data into training and testing sets, stratified by class.
# The seed is fixed so the split is reproducible.
set.seed(1701)
bean_split <- initial_split(beans, strata = class, prop = 3 / 4)

bean_train <- training(bean_split)
bean_test <- testing(bean_split)

# Split the training set again to obtain a validation set, using a separate
# seed so this split is reproducible as well.
set.seed(1702)
bean_val <- validation_split(bean_train, strata = class, prop = 4 / 5)

# Print the single split to inspect its sizes.
bean_val$splits[[1]]
## <Training/Validation/Total>
## <8163/2044/10207>
# Use the training data from the bean_val split object
bean_rec <-
  # 1. get the ingredients
  recipe(class ~ ., data = analysis(bean_val$splits[[1]])) %>%
  # 2. write the recipe
  step_zv(all_numeric_predictors()) %>%
  step_orderNorm(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

# 3. prepare the recipe
bean_rec_trained <- prep(bean_rec)

# Prepare the recipe again with log_changes = TRUE; the lines below are the
# per-step log this call printed.
show_variables <-
  bean_rec %>%
  prep(log_changes = TRUE)
## step_zv (zv_SqX2i): same number of columns
##
## step_orderNorm (orderNorm_x4c8K): same number of columns
##
## step_normalize (normalize_NF9ZV): same number of columns
# Extract the assessment (validation) rows from the first, and only, split.
bean_validation <- bean_val$splits %>%
  pluck(1) %>%
  assessment()

# 4. bake the recipe
bean_val_processed <- bake(bean_rec_trained, new_data = bean_validation)
# Prepare a recipe, apply it to a dataset, and draw a scatterplot matrix of
# the processed columns, coloured and filled by `class`.
#
# Arguments:
#   recipe  An (unprepared) recipe; it is passed to prep() inside the function.
#   dat     Data to process and plot. Defaults to the assessment set of the
#           first split in the global `bean_val` object.
#
# Returns the ggplot object built by the pipeline.
plot_validation_results <- function(recipe,
                                    dat = assessment(bean_val$splits[[1]])) {
  recipe %>%
    # Estimate any additional steps
    prep() %>%
    # Process the data (the validation set by default)
    bake(new_data = dat) %>%
    # Create the scatterplot matrix
    ggplot(aes(x = .panel_x, y = .panel_y, col = class, fill = class)) +
    geom_point(alpha = 0.4, size = 0.5) +
    geom_autodensity(alpha = .3) +
    # Facet over every column except class; layer 2 (the density) is drawn
    # on the diagonal panels.
    facet_matrix(vars(-class), layer.diag = 2) +
    scale_color_brewer(palette = "Dark2") +
    scale_fill_brewer(palette = "Dark2")
}
Some examples of recipe steps: