16.2 Setting up the environment

16.2.1 Loading libraries

# Helper packages
library(dplyr)      # for data wrangling
library(ggplot2)    # for awesome graphics

# Modeling packages
library(h2o)       # for interfacing with H2O
library(recipes)   # for ML recipes
library(rsample)   # for data splitting
library(xgboost)   # for fitting GBMs

# Model interpretability packages
library(pdp)       # for partial dependence plots (and ICE curves)
library(vip)       # for variable importance plots
library(iml)       # for general IML-related functions
library(DALEX)     # for general IML-related functions
library(lime)      # for local interpretable model-agnostic explanations

16.2.2 Getting the data

# Load and split the Ames housing data
ames <- AmesHousing::make_ames()

# for reproducibility
set.seed(123)  
split <- initial_split(ames, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)

# Make sure we have consistent categorical levels
blueprint <- recipe(Sale_Price ~ .,
                    data = ames_train) %>%
  step_other(all_nominal(),
             threshold = 0.005)
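
Stratified sampling should keep the distribution of Sale_Price similar across the two sets; a quick, optional sanity check (not part of the original workflow) is to compare a few quantiles:

# Optional check: stratified sampling should yield similar Sale_Price
# distributions in the training and test sets
quantile(ames_train$Sale_Price, probs = c(0.25, 0.50, 0.75))
quantile(ames_test$Sale_Price,  probs = c(0.25, 0.50, 0.75))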

# Starting H2O
h2o.init(max_mem_size = "10g")
options(timeout = 5000)

# Create training set for h2o
train_h2o <- prep(blueprint,
                  training = ames_train,
                  retain = TRUE) %>%
  juice() %>%
  as.h2o()

# Create testing set for h2o
test_h2o <- prep(blueprint,
                 training = ames_train) %>%
  bake(new_data = ames_test) %>%
  as.h2o()

# Get response and feature names
Y <- "Sale_Price"
X <- setdiff(names(ames_train), Y)
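
As another optional sanity check (not in the original code), we can confirm that both prepared H2O frames have the expected number of rows and columns before training:

# Optional check: dimensions of the prepared H2O frames
dim(train_h2o)
dim(test_h2o)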

16.2.3 Training the models

# Regularized regression base learner
best_glm <- h2o.glm(
  x = X,
  y = Y,
  training_frame = train_h2o,
  alpha = 0.1,
  remove_collinear_columns = TRUE,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123
)

# Random forest base learner
best_rf <- h2o.randomForest(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 500,
  mtries = 20,
  max_depth = 30,
  min_rows = 100,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)


# GBM base learner
best_gbm <- h2o.gbm(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 500,
  learn_rate = 0.01,
  max_depth = 7,
  min_rows = 5,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)

# XGBoost base learner
#   H2O's XGBoost backend is not available on Windows, so we skip this base learner
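
On an operating system where H2O supports XGBoost, this base learner could be fit with h2o.xgboost(); the sketch below reuses the settings of the GBM above as illustrative assumptions, not tuned values. If it were trained, best_xgb could also be passed to base_models in the stacked ensemble below.

# XGBoost base learner (skipped on Windows; hyperparameters are illustrative)
best_xgb <- h2o.xgboost(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 500,
  learn_rate = 0.01,
  max_depth = 7,
  min_rows = 5,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)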

# Stacked model
ensemble_tree <- h2o.stackedEnsemble(
  x = X,
  y = Y,
  training_frame = train_h2o,
  model_id = "my_tree_ensemble",
  base_models = list(best_glm, best_rf, best_gbm),
  # Meta learner: random forest
  metalearner_algorithm = "drf"
)
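
Before moving on to the interpretability tools, it can be worth checking whether the stack actually improves on its base learners; one possible check (not part of the original code) compares test-set RMSE:

# Optional: test-set RMSE of the ensemble and its base learners
h2o.rmse(h2o.performance(ensemble_tree, newdata = test_h2o))
h2o.rmse(h2o.performance(best_glm, newdata = test_h2o))
h2o.rmse(h2o.performance(best_rf,  newdata = test_h2o))
h2o.rmse(h2o.performance(best_gbm, newdata = test_h2o))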

16.2.4 Defining local observations to explain

# Compute predictions on the training data
predictions <- predict(ensemble_tree, train_h2o) |>
  as.vector()

# Identify the observations with the highest and lowest predicted sale price
paste("Observation", which.max(predictions),
      "has a predicted sale price of",
      scales::dollar(max(predictions)))

paste("Observation", which.min(predictions),
      "has a predicted sale price of",
      scales::dollar(min(predictions)))


# Grab feature values for the observations with the min/max predicted sale price
high_ob <- as.data.frame(train_h2o)[which.max(predictions), ] |> 
  select(-Sale_Price)

low_ob  <- as.data.frame(train_h2o)[which.min(predictions), ] |>
  select(-Sale_Price)