15.2 Creating workflow_sets
seed <- 2021
col_y <- 'ladder_score'
col_y_sym <- col_y %>% sym()
set.seed(seed)
split <- df_selected %>% initial_split(strata = !!col_y_sym)
df_trn <- split %>% training()
df_tst <- split %>% testing()
folds <-
  df_trn %>%
  vfold_cv(strata = !!col_y_sym, repeats = 5)
folds
## # 10-fold cross-validation repeated 5 times using stratification
## # A tibble: 50 × 3
## splits id id2
## <list> <chr> <chr>
## 1 <split [97/12]> Repeat1 Fold01
## 2 <split [97/12]> Repeat1 Fold02
## 3 <split [97/12]> Repeat1 Fold03
## 4 <split [97/12]> Repeat1 Fold04
## 5 <split [97/12]> Repeat1 Fold05
## 6 <split [97/12]> Repeat1 Fold06
## 7 <split [97/12]> Repeat1 Fold07
## 8 <split [100/9]> Repeat1 Fold08
## 9 <split [101/8]> Repeat1 Fold09
## 10 <split [101/8]> Repeat1 Fold10
## # ℹ 40 more rows
# A weird way of creating formulas that I sometimes use; it can be helpful
# if you're experimenting with different response variables.
form <- paste0(col_y, ' ~ .') %>% as.formula()
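As an aside (my addition, with a made-up second column name), the same trick makes it easy to loop over candidate responses:

# Hypothetical: one formula per candidate response column.
# ('social_support' is an assumed alternative; swap in your own.)
cols_y <- c('ladder_score', 'social_support')
forms <- purrr::map(cols_y, ~ as.formula(paste0(.x, ' ~ .')))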
rec_norm <-
  df_trn %>%
  recipe(form, data = .) %>%
  step_normalize(all_predictors())

rec_poly <-
  rec_norm %>%
  step_poly(all_predictors()) %>%
  step_interact(~ all_predictors():all_predictors())

rec_poly
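As a quick sanity check of my own (not essential), we can prep() and bake() the recipe to confirm that the polynomial and interaction columns were actually created:

# Peek at the engineered feature names.
rec_poly %>% prep() %>% bake(new_data = NULL) %>% names() %>% head()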
Next up: the model specifications. A little helper cuts down on repeated set_mode() calls.
library(rules)
library(baguette)
f_set <- function(spec) {
  spec %>%
    set_mode('regression')
}
spec_lr <-
  linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine('glmnet')

spec_mars <-
  mars(prod_degree = tune()) %>%
  set_engine('earth') %>%
  f_set()

spec_svm_r <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine('kernlab') %>%
  f_set()

spec_svm_p <-
  svm_poly(cost = tune(), degree = tune()) %>%
  set_engine('kernlab') %>%
  f_set()

spec_knn <-
  nearest_neighbor(
    neighbors = tune(),
    dist_power = tune(),
    weight_func = tune()
  ) %>%
  set_engine('kknn') %>%
  f_set()

spec_cart <-
  decision_tree(cost_complexity = tune(), min_n = tune()) %>%
  set_engine('rpart') %>%
  f_set()

spec_cart_bag <-
  bag_tree() %>%
  set_engine('rpart', times = 50L) %>%
  f_set()

spec_rf <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 200L) %>%
  set_engine('ranger') %>%
  f_set()

spec_xgb <-
  boost_tree(
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    min_n = tune(),
    sample_size = tune(),
    trees = 200L
  ) %>%
  set_engine('xgboost') %>%
  f_set()
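# Note: linear_reg() above and cubist_rules() below are regression-only in
# parsnip, so they skip the f_set() helper.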
spec_cube <-
  cubist_rules(committees = tune(), neighbors = tune()) %>%
  set_engine('Cubist')
How I felt after creating 10 model specs
We can create workflow_sets, combining the recipe that standardizes the predictors with the non-linear models that work best when all predictors are on the same scale.
library(workflowsets)
sets_norm <-
  workflow_set(
    preproc = list(norm = rec_norm),
    models = list(
      svm_r = spec_svm_r,
      svm_p = spec_svm_p,
      knn = spec_knn
    )
  )
sets_norm
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 norm_svm_r <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 norm_svm_p <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 norm_knn <tibble [1 × 4]> <opts[0]> <list [0]>
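If you want to verify that a recipe and model got paired the way you intended (an aside of mine), extract_workflow() pulls a single workflow out of the set by its id:

# Inspect one recipe-model pairing from the set.
sets_norm %>% extract_workflow(id = 'norm_knn')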
Let’s apply the quadratic pre-processing to models where it is most applicable.
sets_poly <-
  workflow_set(
    preproc = list(poly = rec_poly),
    models = list(lr = spec_lr, knn = spec_knn)
  )
Finally, several of the models don't really need any pre-processing. Nonetheless, workflow_set() requires a preproc element, so we can pass the bare formula as a dummy pre-processing step (workflowsets::workflow_variables() is an alternative).
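If you prefer the workflow_variables() route, a sketch of the equivalent dummy preproc (which would go in the preproc list below) looks like this:

# Alternative dummy preproc: declare the outcome and predictors by name.
workflow_variables(outcomes = ladder_score, predictors = everything())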
sets_simple <-
  workflow_set(
    preproc = list(form),
    models =
      list(
        mars = spec_mars,
        cart = spec_cart,
        cart_bag = spec_cart_bag,
        rf = spec_rf,
        gb = spec_xgb,
        cube = spec_cube
      )
  )
sets_simple
## # A workflow set/tibble: 6 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 formula_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 formula_cart <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 formula_cart_bag <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 formula_rf <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 formula_gb <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 formula_cube <tibble [1 × 4]> <opts[0]> <list [0]>
We can bind all of our workflow_sets together.
sets <-
  bind_rows(sets_norm, sets_poly, sets_simple)
sets
## # A workflow set/tibble: 11 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 norm_svm_r <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 norm_svm_p <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 norm_knn <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 poly_lr <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 poly_knn <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 formula_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 formula_cart <tibble [1 × 4]> <opts[0]> <list [0]>
## 8 formula_cart_bag <tibble [1 × 4]> <opts[0]> <list [0]>
## 9 formula_rf <tibble [1 × 4]> <opts[0]> <list [0]>
## 10 formula_gb <tibble [1 × 4]> <opts[0]> <list [0]>
## 11 formula_cube <tibble [1 × 4]> <opts[0]> <list [0]>
And do the thing! (Observe the elegance.)
ctrl_grid <-
  control_grid(
    save_pred = TRUE,
    parallel_over = 'everything',
    save_workflow = TRUE
  )
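One caveat of mine: parallel_over = 'everything' only pays off if a parallel backend is registered beforehand. A minimal sketch with doParallel (the worker count is an arbitrary assumption):

# Register a parallel backend so tuning can fan out over resamples and grids.
library(doParallel)
cl <- makePSOCKcluster(4) # assumed worker count; match your machine
registerDoParallel(cl)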
res_grid <-
  sets %>%
  workflow_map(
    seed = seed,
    resamples = folds,
    grid = 3,
    control = ctrl_grid,
    verbose = TRUE
  )
How I felt waiting for this to finish running
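Once it finally finishes, a couple of one-liners (my sketch, not part of the original run) surface the best-performing workflows:

# Rank workflows by their best resampled RMSE, then plot the comparison.
res_grid %>% rank_results(rank_metric = 'rmse', select_best = TRUE)
autoplot(res_grid, rank_metric = 'rmse', select_best = TRUE)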
