15.4 Alternative Model Training Process
Rather than using a single tuned base learner, we can stack multiple models generated from the same base learner and allow the super learner to perform the tuning process. The code that follows assumes the `train_h2o`, `test_h2o`, `X`, and `Y` objects created earlier in the chapter; a minimal setup sketch is shown below.
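A minimal setup sketch, assuming the Ames housing split (`ames_train`/`ames_test`) used earlier in the chapter; swap in your own data as needed.
# Start H2O and convert the training/test splits to H2O frames
library(h2o)
h2o.init()
train_h2o <- as.h2o(ames_train)
test_h2o <- as.h2o(ames_test)
# Response variable and predictor names (assumes the Ames data)
Y <- "Sale_Price"
X <- setdiff(names(ames_train), Y)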
- Create a grid of hyperparameters to tune.
# Define GBM hyperparameter grid
hyper_grid <- list(
  max_depth = c(1, 3, 5),
  min_rows = c(1, 5, 10),
  learn_rate = c(0.01, 0.05, 0.1),
  learn_rate_annealing = c(0.99, 1),
  sample_rate = c(0.5, 0.75, 1),
  col_sample_rate = c(0.8, 0.9, 1)
)
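A quick sanity check on the size of the full Cartesian grid (3 × 3 × 3 × 2 × 3 × 3 = 486 combinations), which motivates the random search used next:
# Number of combinations in the full grid
prod(lengths(hyper_grid))
## [1] 486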
- Define the search criteria. Rather than evaluating all 486 combinations, we perform a random search capped at 25 models.
search_criteria <- list(
  # Perform a random search of all the combinations
  strategy = "RandomDiscrete",
  # And stop after reaching the maximum number of models
  max_models = 25
)
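As an alternative sketch (not used below), the random search can be capped by wall-clock time rather than model count via H2O's `max_runtime_secs` criterion:
# Hypothetical variant: stop the random search after ten minutes
search_criteria_timed <- list(
  strategy = "RandomDiscrete",
  max_runtime_secs = 600
)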
- Train and cross-validate a model for each sampled combination. Identical fold assignment and saved cross-validation predictions are required so the models can be stacked later.
# Build random grid search
random_grid <- h2o.grid(
  algorithm = "gbm",
  grid_id = "gbm_grid",
  x = X,
  y = Y,
  training_frame = train_h2o,
  hyper_params = hyper_grid,
  search_criteria = search_criteria,
  ntrees = 5000,
  stopping_metric = "RMSE",
  stopping_rounds = 10,
  stopping_tolerance = 0,
  # Consistent folds and saved CV predictions are required for stacking
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123
)
- Arrange the models by performance
# Sort results by RMSE
h2o.getGrid(
grid_id = "gbm_grid",
sort_by = "rmse"
)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - learn_rate_annealing
## - max_depth
## - min_rows
## - sample_rate
## Number of models: 25
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by increasing rmse
## col_sample_rate learn_rate learn_rate_annealing max_depth min_rows
## 1 0.90000 0.01000 1.00000 5.00000 10.00000
## 2 0.90000 0.01000 1.00000 5.00000 1.00000
## 3 0.80000 0.10000 1.00000 3.00000 5.00000
## 4 0.80000 0.10000 0.99000 3.00000 1.00000
## 5 0.80000 0.05000 1.00000 5.00000 5.00000
## sample_rate model_ids rmse
## 1 0.50000 gbm_grid_model_11 22299.59397
## 2 1.00000 gbm_grid_model_2 23016.72073
## 3 0.75000 gbm_grid_model_1 23058.95350
## 4 0.75000 gbm_grid_model_20 23110.08213
## 5 0.75000 gbm_grid_model_9 23217.62738
##
## ---
## col_sample_rate learn_rate learn_rate_annealing max_depth min_rows
## 20 1.00000 0.01000 0.99000 5.00000 5.00000
## 21 0.80000 0.01000 0.99000 5.00000 1.00000
## 22 0.90000 0.01000 0.99000 5.00000 10.00000
## 23 0.90000 0.01000 0.99000 3.00000 10.00000
## 24 0.80000 0.01000 0.99000 1.00000 5.00000
## 25 0.90000 0.01000 0.99000 1.00000 10.00000
## sample_rate model_ids rmse
## 20 0.50000 gbm_grid_model_22 40781.63918
## 21 0.75000 gbm_grid_model_21 40793.61345
## 22 0.75000 gbm_grid_model_23 41133.16192
## 23 1.00000 gbm_grid_model_17 44768.53587
## 24 0.50000 gbm_grid_model_14 57401.59369
## 25 0.50000 gbm_grid_model_12 57405.33471
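The spread between the best and worst models is large (RMSE of roughly 22300 vs. 57405). A sketch for pulling the sorted RMSE values out of the grid for inspection; `summary_table` is a slot on the `H2OGrid` object returned by `h2o.getGrid()`:
# Extract the cross-validated RMSE values from the sorted grid
sorted_grid <- h2o.getGrid(grid_id = "gbm_grid", sort_by = "rmse")
rmse_values <- as.numeric(sorted_grid@summary_table$rmse)
range(rmse_values)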
- Train the stacked model
# Train a stacked ensemble using the GBM grid
ensemble <- h2o.stackedEnsemble(
  x = X,
  y = Y,
  training_frame = train_h2o,
  model_id = "ensemble_gbm_grid",
  base_models = random_grid@model_ids,
  metalearner_algorithm = "gbm"
)
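The metalearner here is itself a GBM; when `metalearner_algorithm` is omitted, `h2o.stackedEnsemble()` defaults to a GLM. A variant sketch using the default:
# Same ensemble with the default (GLM) metalearner
ensemble_glm <- h2o.stackedEnsemble(
  x = X,
  y = Y,
  training_frame = train_h2o,
  model_id = "ensemble_gbm_grid_glm",
  base_models = random_grid@model_ids
)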
- Compare the best individual model against the stacked ensemble on the test set.
# Tuned model
random_grid@model_ids[[1]] |>
  h2o.getModel() |>
  h2o.performance(newdata = test_h2o)
## H2ORegressionMetrics: gbm
##
## MSE: 357841477
## RMSE: 18916.7
## MAE: 12116.5
## RMSLE: 0.1004672
## Mean Residual Deviance : 357841477
# Stacked model
h2o.performance(ensemble, newdata = test_h2o)
## H2ORegressionMetrics: stackedensemble
##
## MSE: 420921340
## RMSE: 20516.37
## MAE: 12952.93
## RMSLE: 0.1035663
## Mean Residual Deviance : 420921340
In this case the single tuned GBM (RMSE 18916.7) outperforms the stacked grid (RMSE 20516.37) on the test set; stacking tends to pay off most when the base learners are accurate yet produce uncorrelated predictions, and these 25 GBMs are highly correlated. Finally, shut down the H2O cluster:
h2o.shutdown(prompt = FALSE)