16.2 Setting up the environment
16.2.1 Loading libraries
# Helper packages
library(dplyr) # for data wrangling
library(ggplot2) # for awesome graphics
# Modeling packages
library(h2o) # for interfacing with H2O
library(recipes) # for ML recipes
library(rsample) # for data splitting
library(xgboost) # for fitting GBMs
# Model interpretability packages
library(pdp) # for partial dependence plots (and ICE curves)
library(vip) # for variable importance plots
library(iml) # for general IML-related functions
library(DALEX) # for general IML-related functions
library(lime) # for local interpretable model-agnostic explanations
16.2.2 Getting the data
# Load and split the Ames housing data
ames <- AmesHousing::make_ames()
# for reproducibility
set.seed(123)
split <- initial_split(ames, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
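As a quick optional check (not part of the original workflow), you can confirm that the stratified split kept roughly three quarters of the rows for training and that the Sale_Price distribution looks similar in both pieces:
# Optional sanity check on the stratified split
nrow(ames_train) / nrow(ames)
quantile(ames_train$Sale_Price, probs = c(0.25, 0.5, 0.75))
quantile(ames_test$Sale_Price, probs = c(0.25, 0.5, 0.75))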
# Make sure we have consistent categorical levels
blueprint <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = 0.005)
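To see what step_other() does, an illustrative check (assuming Neighborhood contains levels rarer than the 0.5% threshold) compares the number of Neighborhood levels before and after prepping the recipe:
# Illustrative check: rare Neighborhood levels get lumped into an "other" category
nlevels(ames_train$Neighborhood)
prep(blueprint, training = ames_train) %>%
  juice() %>%
  pull(Neighborhood) %>%
  nlevels()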
# Starting H2O
h2o.init(max_mem_size = "10g")
options(timeout = 5000)
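Optionally, you can silence the progress bars H2O prints during training with the standard helper h2o.no_progress(), which keeps the console output of the following steps easier to read:
# Optional: suppress H2O progress bars
h2o.no_progress()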
# Create training set for h2o
train_h2o <- prep(blueprint, training = ames_train, retain = TRUE) %>%
  juice() %>%
  as.h2o()
# Create testing set for h2o
test_h2o <- prep(blueprint, training = ames_train) %>%
  bake(new_data = ames_test) %>%
  as.h2o()
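Because both frames are produced from the same prepped blueprint, they should share an identical set of columns and factor levels; a quick illustrative check of the dimensions:
# Illustrative check: training and test frames carry the same columns
dim(train_h2o)
dim(test_h2o)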
# Get response and feature names
Y <- "Sale_Price"
X <- setdiff(names(ames_train), Y)
16.2.3 Training the model
# Regularized regression base learner
best_glm <- h2o.glm(
  x = X,
  y = Y,
  training_frame = train_h2o,
  alpha = 0.1,
  remove_collinear_columns = TRUE,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123
)
# Random forest base learner
best_rf <- h2o.randomForest(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 500,
  mtries = 20,
  max_depth = 30,
  min_rows = 100,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)
# GBM base learner
best_gbm <- h2o.gbm(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 500,
  learn_rate = 0.01,
  max_depth = 7,
  min_rows = 5,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)
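Before stacking, it can be useful to see how the base learners perform on their own. A minimal sketch (not part of the original workflow) pulls each model's 10-fold cross-validated RMSE with h2o.rmse():
# Compare cross-validated RMSE across the three base learners
sapply(
  list(glm = best_glm, rf = best_rf, gbm = best_gbm),
  h2o.rmse,
  xval = TRUE
)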
# XGBoost base learner
# H2O's XGBoost backend is not available on Windows, so we skip this base learner there
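On Linux or macOS, where H2O's XGBoost backend is supported, a fourth base learner could be added along the lines of the sketch below; the hyperparameter values are illustrative rather than taken from the original text, and h2o.xgboost.available() reports whether the running cluster supports XGBoost at all.
# Sketch only: XGBoost base learner on platforms where the backend is available
if (h2o.xgboost.available()) {
  best_xgb <- h2o.xgboost(
    x = X,
    y = Y,
    training_frame = train_h2o,
    ntrees = 500,
    learn_rate = 0.05,
    max_depth = 3,
    min_rows = 3,
    sample_rate = 0.8,
    nfolds = 10,
    fold_assignment = "Modulo",
    keep_cross_validation_predictions = TRUE,
    seed = 123,
    stopping_rounds = 50,
    stopping_metric = "RMSE",
    stopping_tolerance = 0
  )
}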
# Stacked model
ensemble_tree <- h2o.stackedEnsemble(
  x = X,
  y = Y,
  training_frame = train_h2o,
  model_id = "my_tree_ensemble",
  base_models = list(best_glm, best_rf, best_gbm),
  # Meta learner: random forest
  metalearner_algorithm = "drf"
)
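To gauge whether the stack improves on its constituents, a short illustrative check (not in the original text) scores the ensemble on the held-out test frame with h2o.performance():
# Illustrative check: ensemble RMSE on the held-out test set
h2o.performance(ensemble_tree, newdata = test_h2o) %>%
  h2o.rmse()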
16.2.4 Defining local observations to explain
# Compute predictions
predictions <- predict(ensemble_tree, train_h2o) %>%
  as.vector()
# Print the observations with the highest and lowest predicted sale price
paste("Observation", which.max(predictions),
      "has a predicted sale price of", scales::dollar(max(predictions)))
paste("Observation", which.min(predictions),
      "has a predicted sale price of", scales::dollar(min(predictions)))
# Grab feature values for observations with min/max predicted sale price
high_ob <- as.data.frame(train_h2o)[which.max(predictions), ] %>%
  select(-Sale_Price)
low_ob <- as.data.frame(train_h2o)[which.min(predictions), ] %>%
  select(-Sale_Price)
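Both high_ob and low_ob are now ordinary single-row data frames with the response removed, which is the form the local explainers in the following sections typically expect; a quick illustrative check:
# Illustrative check: one row each, no Sale_Price column
dim(high_ob)
"Sale_Price" %in% names(high_ob)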