15.2 Creating workflow_sets
seed <- 2021
col_y <- 'ladder_score'
col_y_sym <- col_y %>% sym()
set.seed(seed)
split <- df_selected %>% initial_split(strata = !!col_y_sym)
df_trn <- split %>% training()
df_tst <- split %>% testing()
folds <-
  df_trn %>%
  vfold_cv(strata = !!col_y_sym, repeats = 5)
folds
## # 10-fold cross-validation repeated 5 times using stratification
## # A tibble: 50 × 3
## splits id id2
## <list> <chr> <chr>
## 1 <split [97/12]> Repeat1 Fold01
## 2 <split [97/12]> Repeat1 Fold02
## 3 <split [97/12]> Repeat1 Fold03
## 4 <split [97/12]> Repeat1 Fold04
## 5 <split [97/12]> Repeat1 Fold05
## 6 <split [97/12]> Repeat1 Fold06
## 7 <split [97/12]> Repeat1 Fold07
## 8 <split [100/9]> Repeat1 Fold08
## 9 <split [101/8]> Repeat1 Fold09
## 10 <split [101/8]> Repeat1 Fold10
## # ℹ 40 more rows
# A weird way of creating formulas that I sometimes use; it can be helpful
# if you're experimenting with different response variables.
form <- paste0(col_y, ' ~ .') %>% as.formula()
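As an aside (my addition, with a made-up second column name), the same trick makes it easy to loop over candidate responses:

# Hypothetical: one formula per candidate response column.
# ('social_support' is an assumed alternative; swap in your own.)
cols_y <- c('ladder_score', 'social_support')
forms <- purrr::map(cols_y, ~ as.formula(paste0(.x, ' ~ .')))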
rec_norm <-
  df_trn %>%
  recipe(form, data = .) %>%
  step_normalize(all_predictors())

rec_poly <-
  rec_norm %>%
  step_poly(all_predictors()) %>%
  step_interact(~ all_predictors():all_predictors())

rec_poly
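As a quick sanity check of my own (not essential), we can prep() and bake() the recipe to confirm that the polynomial and interaction columns were actually created:

# Peek at the engineered feature names.
rec_poly %>% prep() %>% bake(new_data = NULL) %>% names() %>% head()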
Next up: the model specifications. A little helper cuts down on repeated set_mode() calls.
library(rules)
library(baguette)
f_set <- function(spec) {
  spec %>%
    set_mode('regression')
}
spec_lr <-
  linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine('glmnet')

spec_mars <-
  mars(prod_degree = tune()) %>%
  set_engine('earth') %>%
  f_set()

spec_svm_r <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine('kernlab') %>%
  f_set()

spec_svm_p <-
  svm_poly(cost = tune(), degree = tune()) %>%
  set_engine('kernlab') %>%
  f_set()

spec_knn <-
  nearest_neighbor(
    neighbors = tune(),
    dist_power = tune(),
    weight_func = tune()
  ) %>%
  set_engine('kknn') %>%
  f_set()

spec_cart <-
  decision_tree(cost_complexity = tune(), min_n = tune()) %>%
  set_engine('rpart') %>%
  f_set()

spec_cart_bag <-
  bag_tree() %>%
  set_engine('rpart', times = 50L) %>%
  f_set()

spec_rf <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 200L) %>%
  set_engine('ranger') %>%
  f_set()

spec_xgb <-
  boost_tree(
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    min_n = tune(),
    sample_size = tune(),
    trees = 200L
  ) %>%
  set_engine('xgboost') %>%
  f_set()
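# Note: linear_reg() above and cubist_rules() below are regression-only in
# parsnip, so they skip the f_set() helper.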
spec_cube <-
  cubist_rules(committees = tune(), neighbors = tune()) %>%
  set_engine('Cubist')
How I felt after creating 10 model specs
We can create workflow_sets, combining the recipe that standardizes the predictors with the non-linear models that work best when all predictors are on the same scale.
library(workflowsets)
sets_norm <-
  workflow_set(
    preproc = list(norm = rec_norm),
    models = list(
      svm_r = spec_svm_r,
      svm_p = spec_svm_p,
      knn = spec_knn
    )
  )
sets_norm
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 norm_svm_r <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 norm_svm_p <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 norm_knn <tibble [1 × 4]> <opts[0]> <list [0]>
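If you want to verify that a recipe and model got paired the way you intended (an aside of mine), extract_workflow() pulls a single workflow out of the set by its id:

# Inspect one recipe-model pairing from the set.
sets_norm %>% extract_workflow(id = 'norm_knn')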
Let’s apply the quadratic pre-processing to models where it is most applicable.
sets_poly <-
  workflow_set(
    preproc = list(poly = rec_poly),
    models = list(lr = spec_lr, knn = spec_knn)
  )
Finally, several of the models don't really need any pre-processing. Nonetheless, workflow_set() requires a preproc element, so we can pass the bare formula as a dummy pre-processing step (workflowsets::workflow_variables() is an alternative).
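If you prefer the workflow_variables() route, a sketch of the equivalent dummy preproc (which would go in the preproc list below) looks like this:

# Alternative dummy preproc: declare the outcome and predictors by name.
workflow_variables(outcomes = ladder_score, predictors = everything())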
sets_simple <-
  workflow_set(
    preproc = list(form),
    models =
      list(
        mars = spec_mars,
        cart = spec_cart,
        cart_bag = spec_cart_bag,
        rf = spec_rf,
        gb = spec_xgb,
        cube = spec_cube
      )
  )
sets_simple
## # A workflow set/tibble: 6 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 formula_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 formula_cart <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 formula_cart_bag <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 formula_rf <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 formula_gb <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 formula_cube <tibble [1 × 4]> <opts[0]> <list [0]>
We can bind all of our workflow_sets together.
sets <-
  bind_rows(sets_norm, sets_poly, sets_simple)
sets
## # A workflow set/tibble: 11 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 norm_svm_r <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 norm_svm_p <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 norm_knn <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 poly_lr <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 poly_knn <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 formula_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 formula_cart <tibble [1 × 4]> <opts[0]> <list [0]>
## 8 formula_cart_bag <tibble [1 × 4]> <opts[0]> <list [0]>
## 9 formula_rf <tibble [1 × 4]> <opts[0]> <list [0]>
## 10 formula_gb <tibble [1 × 4]> <opts[0]> <list [0]>
## 11 formula_cube <tibble [1 × 4]> <opts[0]> <list [0]>
And do the thing! (Observe the elegance.)
ctrl_grid <-
  control_grid(
    save_pred = TRUE,
    parallel_over = 'everything',
    save_workflow = TRUE
  )
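One caveat of mine: parallel_over = 'everything' only pays off if a parallel backend is registered beforehand. A minimal sketch with doParallel (the worker count is an arbitrary assumption):

# Register a parallel backend so tuning can fan out over resamples and grids.
library(doParallel)
cl <- makePSOCKcluster(4) # assumed worker count; match your machine
registerDoParallel(cl)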
res_grid <-
  sets %>%
  workflow_map(
    seed = seed,
    resamples = folds,
    grid = 3,
    control = ctrl_grid,
    verbose = TRUE
  )
How I felt waiting for this to finish running
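Once it finally finishes, a couple of one-liners (my sketch, not part of the original run) surface the best-performing workflows:

# Rank workflows by their best resampled RMSE, then plot the comparison.
res_grid %>% rank_results(rank_metric = 'rmse', select_best = TRUE)
autoplot(res_grid, rank_metric = 'rmse', select_best = TRUE)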
