16.2 Setting the environment
16.2.1 Loading libraries
# Helper packages
library(dplyr) # for data wrangling
library(ggplot2) # for awesome graphics
# Modeling packages
library(h2o) # for interfacing with H2O
library(recipes) # for ML recipes
library(rsample) # for data splitting
library(xgboost) # for fitting GBMs
# Model interpretability packages
library(pdp) # for partial dependence plots (and ICE curves)
library(vip) # for variable importance plots
library(iml) # for general IML-related functions
library(DALEX) # for general IML-related functions
library(lime) # for local interpretable model-agnostic explanations
16.2.2 Getting the data
# Load and split the Ames housing data
ames <- AmesHousing::make_ames()

# for reproducibility
set.seed(123)
split <- initial_split(ames, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
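As an optional sanity check (not required for what follows), you can confirm that the stratified split produced the expected partition sizes; initial_split() holds out roughly 25% of rows by default.
# Optional: check the split sizes
dim(ames_train)
dim(ames_test)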
# Make sure we have consistent categorical levels
blueprint <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = 0.005)
# Starting H2O
h2o.init(max_mem_size = "10g")
options(timeout = 5000)
# Create training set for h2o
train_h2o <- prep(blueprint, training = ames_train, retain = TRUE) %>%
  juice() %>%
  as.h2o()

# Create testing set for h2o
test_h2o <- prep(blueprint, training = ames_train) %>%
  bake(new_data = ames_test) %>%
  as.h2o()
# Get response and feature names
Y <- "Sale_Price"
X <- setdiff(names(ames_train), Y)
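Another optional check, using only the objects created above: confirm that both H2O frames were built and that the feature vector excludes the response.
# Optional: verify frame dimensions and predictor count
dim(train_h2o)
dim(test_h2o)
length(X)  # number of predictors; excludes Sale_Price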
16.2.3 Training the model
# Regularized regression base learner
best_glm <- h2o.glm(
  x = X,
  y = Y,
  training_frame = train_h2o,
  alpha = 0.1,
  remove_collinear_columns = TRUE,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123
)
# Random forest base learner
best_rf <- h2o.randomForest(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 500,
  mtries = 20,
  max_depth = 30,
  min_rows = 100,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)
# GBM base learner
best_gbm <- h2o.gbm(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 500,
  learn_rate = 0.01,
  max_depth = 7,
  min_rows = 5,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)
# XGBoost base learner
# Not fit here: H2O's XGBoost backend is not available on Windows (see the sketch below)
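For reference, here is a sketch of how the XGBoost base learner would be specified on a platform where H2O's XGBoost backend is available. The call mirrors the other base learners; the hyperparameter values shown are illustrative rather than tuned, and because the model is not fit here it is not included in the ensemble below.
# Sketch only (not run): XGBoost base learner on a supported platform
# best_xgb <- h2o.xgboost(
#   x = X,
#   y = Y,
#   training_frame = train_h2o,
#   ntrees = 500,
#   learn_rate = 0.05,
#   max_depth = 3,
#   min_rows = 3,
#   sample_rate = 0.8,
#   nfolds = 10,
#   fold_assignment = "Modulo",
#   keep_cross_validation_predictions = TRUE,
#   seed = 123,
#   stopping_rounds = 50,
#   stopping_metric = "RMSE",
#   stopping_tolerance = 0
# )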
# Stacked model
ensemble_tree <- h2o.stackedEnsemble(
  x = X,
  y = Y,
  training_frame = train_h2o,
  model_id = "my_tree_ensemble",
  base_models = list(best_glm, best_rf, best_gbm),
  # Meta learner: random forest
  metalearner_algorithm = "drf"
)
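If you want a quick sense of how well the stacked ensemble generalizes before turning to interpretation, you can score it on the held-out test frame; this check is optional and not part of the workflow that follows.
# Optional: assess ensemble performance on the test set
h2o.performance(ensemble_tree, newdata = test_h2o)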
16.2.4 Defining local observations to explain
# Compute predictions
predictions <- predict(ensemble_tree, train_h2o) |>
  as.vector()

# Observation with the highest predicted sale price
max(predictions) |>
  scales::dollar() |>
  paste("Observation",
        which.max(predictions),
        "has a predicted sale price of",
        a = _)

# Observation with the lowest predicted sale price
min(predictions) |>
  scales::dollar() |>
  paste("Observation",
        which.min(predictions),
        "has a predicted sale price of",
        a = _)
# Grab feature values for observations with min/max predicted sales price
high_ob <- as.data.frame(train_h2o)[which.max(predictions), ] |>
  select(-Sale_Price)
low_ob <- as.data.frame(train_h2o)[which.min(predictions), ] |>
  select(-Sale_Price)
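Before explaining these two observations, it can help to glance at their feature values; glimpse() from dplyr (already loaded) prints each column and its value for a single-row data frame. This inspection is optional.
# Optional: inspect the two local observations
glimpse(high_ob)
glimpse(low_ob)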