15.2 Creating `workflow_set`s
# Fix the RNG seed and name the response column once so both can be reused.
seed <- 2021
col_y <- 'ladder_score'
# Symbol form of the response name, for unquoting (`!!`) in tidy-eval arguments.
col_y_sym <- col_y %>% sym()

set.seed(seed)
# Train/test split, stratified on the response.
split <- df_selected %>% initial_split(strata = !!col_y_sym)
df_trn <- split %>% training()
df_tst <- split %>% testing()

# Repeated, stratified cross-validation folds built from the training set only.
folds <-
  df_trn %>%
  vfold_cv(strata = !!col_y_sym, repeats = 5)
folds
## # 10-fold cross-validation repeated 5 times using stratification
## # A tibble: 50 × 3
## splits id id2
## <list> <chr> <chr>
## 1 <split [97/12]> Repeat1 Fold01
## 2 <split [97/12]> Repeat1 Fold02
## 3 <split [97/12]> Repeat1 Fold03
## 4 <split [97/12]> Repeat1 Fold04
## 5 <split [97/12]> Repeat1 Fold05
## 6 <split [97/12]> Repeat1 Fold06
## 7 <split [97/12]> Repeat1 Fold07
## 8 <split [100/9]> Repeat1 Fold08
## 9 <split [101/8]> Repeat1 Fold09
## 10 <split [101/8]> Repeat1 Fold10
## # ℹ 40 more rows
# My weird way of creating formulas sometimes, which can be helpful if you're
# experimenting with different response variables.
form <- paste0(col_y, '~ .') %>% as.formula()

# Base recipe: response ~ all other columns, with every predictor normalized.
rec_norm <-
  df_trn %>%
  recipe(form, data = .) %>%
  step_normalize(all_predictors())

# Extends the normalized recipe with polynomial terms and predictor-by-predictor
# interactions.
rec_poly <-
  rec_norm %>%
  step_poly(all_predictors()) %>%
  step_interact(~ all_predictors():all_predictors())
rec_poly
Code for recipes…
library(rules)
library(baguette)
# Set a model specification's mode to regression.
#
# `spec` is the model specification to modify; the result of
# `set_mode(spec, 'regression')` is returned, so the helper can sit at the end
# of a pipe.
f_set <- function(spec) {
  spec %>%
    set_mode('regression')
}
# Model specifications. Arguments set to `tune()` are left open for tuning;
# `f_set()` pins the mode to regression where it is applied.

# Penalized linear regression.
spec_lr <-
  linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine('glmnet')

# Multivariate adaptive regression splines.
spec_mars <-
  mars(prod_degree = tune()) %>%
  set_engine('earth') %>%
  f_set()

# Support vector machine, radial basis function kernel.
spec_svm_r <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine('kernlab') %>%
  f_set()

# Support vector machine, polynomial kernel.
spec_svm_p <-
  svm_poly(cost = tune(), degree = tune()) %>%
  set_engine('kernlab') %>%
  f_set()

# K-nearest neighbors.
spec_knn <-
  nearest_neighbor(
    neighbors = tune(),
    dist_power = tune(),
    weight_func = tune()
  ) %>%
  set_engine('kknn') %>%
  f_set()

# Single decision tree.
spec_cart <-
  decision_tree(cost_complexity = tune(), min_n = tune()) %>%
  set_engine('rpart') %>%
  f_set()

# Bagged decision trees (no tuned parameters).
spec_cart_bag <-
  bag_tree() %>%
  set_engine('rpart', times = 50L) %>%
  f_set()

# Random forest with a fixed number of trees.
spec_rf <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 200L) %>%
  set_engine('ranger') %>%
  f_set()

# Gradient-boosted trees with a fixed number of trees.
spec_xgb <-
  boost_tree(
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    min_n = tune(),
    sample_size = tune(),
    trees = 200L
  ) %>%
  set_engine('xgboost') %>%
  f_set()

# Cubist rule-based model.
spec_cube <-
  cubist_rules(committees = tune(), neighbors = tune()) %>%
  set_engine('Cubist')
How I felt after creating 10 recipes
We can create `workflow_set`s, combining the recipe that standardizes the predictors with the non-linear models that work best when predictors are all on the same scale.
library(workflowsets)
# Pair the normalizing recipe with the models that benefit from predictors on
# a common scale; each recipe/model combination becomes one workflow.
sets_norm <-
  workflow_set(
    preproc = list(norm = rec_norm),
    models = list(
      svm_r = spec_svm_r,
      svm_p = spec_svm_p,
      knn = spec_knn
    )
  )
sets_norm
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 norm_svm_r <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 norm_svm_p <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 norm_knn <tibble [1 × 4]> <opts[0]> <list [0]>
Let’s apply the quadratic pre-processing to models where it is most applicable.
# Pair the polynomial/interaction recipe with the linear and KNN models.
sets_poly <-
  workflow_set(
    preproc = list(poly = rec_poly),
    models = list(lr = spec_lr, knn = spec_knn)
  )
Finally, there are several models that don’t really need pre-processing. Nonetheless, we need to have a `preproc` step, so we can use `workflowsets::workflow_variables()` for a dummy pre-processing step (the code below simply passes the plain formula, which serves the same purpose).
# Models that need no real pre-processing: the bare formula is supplied as the
# (unnamed) preprocessor, so the resulting workflow ids are prefixed 'formula_'.
sets_simple <-
  workflow_set(
    preproc = list(form),
    models =
      list(
        mars = spec_mars,
        cart = spec_cart,
        cart_bag = spec_cart_bag,
        rf = spec_rf,
        gb = spec_xgb,
        cube = spec_cube
      )
  )
sets_simple
## # A workflow set/tibble: 6 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 formula_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 formula_cart <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 formula_cart_bag <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 formula_rf <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 formula_gb <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 formula_cube <tibble [1 × 4]> <opts[0]> <list [0]>
We can bind all of our `workflow_set`s together.
# Stack the three workflow sets into a single one.
# NOTE: the ids coming from `sets_simple` are prefixed 'formula_', not
# 'simple_', so the pattern below matches nothing and every id is left as-is.
sets <-
  bind_rows(sets_norm, sets_poly, sets_simple) %>%
  mutate(across(wflow_id, ~str_remove(.x, '^simple_')))
sets
## # A workflow set/tibble: 11 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 norm_svm_r <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 norm_svm_p <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 norm_knn <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 poly_lr <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 poly_knn <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 formula_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 formula_cart <tibble [1 × 4]> <opts[0]> <list [0]>
## 8 formula_cart_bag <tibble [1 × 4]> <opts[0]> <list [0]>
## 9 formula_rf <tibble [1 × 4]> <opts[0]> <list [0]>
## 10 formula_gb <tibble [1 × 4]> <opts[0]> <list [0]>
## 11 formula_cube <tibble [1 × 4]> <opts[0]> <list [0]>
And do the thing! (Observe the elegance.)
# Grid-search control options: keep the predictions and the workflows alongside
# the tuning results.
ctrl_grid <-
  control_grid(
    save_pred = TRUE,
    parallel_over = 'everything',
    save_workflow = TRUE
  )

# Run the grid search for every workflow in `sets` against the same resamples,
# reusing `seed` so each workflow starts from the same random state.
res_grid <-
  sets %>%
  workflow_map(
    seed = seed,
    resamples = folds,
    grid = 3,
    control = ctrl_grid,
    verbose = TRUE
  )