9.3 Regression Metrics
Load in the Data
library(tidymodels)
library(glmnet)
library(ranger)
set.seed(1123)
data(ames)
<- ames %>%
ames mutate(
under_budget = as.factor(if_else(Sale_Price<=160000,1,0)),
Sale_Price = log10(Sale_Price))
#Cross-fold validation
<- vfold_cv(ames, v = 5)
ames_folds
#Create Recipe
<-
ames_recipe recipe(formula = Sale_Price ~ Gr_Liv_Area + Full_Bath + Half_Bath + Lot_Area + Neighborhood + Overall_Cond,
data = ames) %>%
step_dummy(all_nominal())
#Set the model and hyperparameters
<-
ames_spec linear_reg(penalty = tune(), mixture = tune()) %>%
set_mode("regression") %>%
set_engine("glmnet")
#Create workflow
<-
ames_workflow workflow() %>%
add_recipe(ames_recipe) %>%
add_model(ames_spec)
#Create metric set of all regression metrics
<-
ames_tune tune_grid(
ames_workflow,metrics =
metric_set(rmse, rsq, rsq_trad, mae, mpe, mape, smape, mase, ccc, rpiq, rpd, huber_loss, huber_loss_pseudo, iic),
resamples = ames_folds,
grid = grid_latin_hypercube(penalty(), mixture(), size = 8)
)
#Pick the best model for each metric and pull out the predictions
<-
best_models tibble(
metric_name = c('rmse', 'rsq', 'rsq_trad', 'mae', 'mpe', 'mape', 'smape', 'mase',
'ccc','rpiq', 'rpd', 'huber_loss', 'huber_loss_pseudo', 'iic')) %>%
mutate(metric_best = map(metric_name, ~select_best(ames_tune, .x)),
wf_best = map(metric_best, ~finalize_workflow(ames_workflow, .x)),
fit_best = map(wf_best, ~fit(.x, data = ames)),
df_pred = map(fit_best, ~ames %>% bind_cols(predict(.x, new_data = ames)) %>% select(Sale_Price, .pred))) %>%
select(-c(wf_best, fit_best)) %>%
unnest(cols = c(metric_name, metric_best, df_pred))
#Plot!
%>%
best_models mutate(metric_desc = factor(
metric_name, levels = c('rmse', 'rsq', 'rsq_trad', 'mae', 'mpe', 'mape', 'smape', 'mase',
'ccc','rpiq', 'rpd', 'huber_loss', 'huber_loss_pseudo', 'iic'),
labels = c('rmse\nwhen you cannot afford\n to have a big error',
'rsq\nwhen you want a measure\n of consistency/correlation\n and not accuracy',
'rsq_trad\n r-sq not constrained\n between 0 and 1',
'mae\nwhen large errors are not\n exponentially\n worse than small errors',
'mpe\nwhen you want an easy way\n to calculate accuracy',
'mape\nwhen you want to use mpe\n with a better\n representation of error',
'smape\nwhen you want to use\n mape expressed as a %',
'mase\nwhen you need a scale\n independent metric\n for time-series data',
'ccc\nwhen you want to measure\n the distance from \nperferct linearity',
'rpiq\nwhen you need a different\n measue of consistency/correlation\n and not accuracy',
'rpd\nwhen you need a different\n measue of consistency/correlation\n and not accuracy',
'huber_loss\nwhen you need a loss\n function less sensitive to outliers',
'huber_loss_pseudo\nwhen you need\n a smoothed version of huber_loss',
'iic\nwhen you need an\n alternative to the traditional\n correlation coefficient'))) %>%
ggplot(aes(x = Sale_Price, y = .pred)) +
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") +
coord_obs_pred() +
facet_wrap(~metric_desc, ncol = 2) +
theme_minimal() +
theme(panel.spacing = unit(2, "lines"),
strip.text.x = element_text(size = 8))
%>% select(metric_name, penalty, mixture) %>% distinct() best_models
## # A tibble: 14 × 3
## metric_name penalty mixture
## <chr> <dbl> <dbl>
## 1 rmse 1.74e- 6 0.887
## 2 rsq 6.71e-10 0.112
## 3 rsq_trad 1.74e- 6 0.887
## 4 mae 1.74e- 6 0.887
## 5 mpe 1.66e- 1 0.163
## 6 mape 1.74e- 6 0.887
## 7 smape 1.74e- 6 0.887
## 8 mase 1.74e- 6 0.887
## 9 ccc 6.71e-10 0.112
## 10 rpiq 6.71e-10 0.112
## 11 rpd 6.71e-10 0.112
## 12 huber_loss 1.74e- 6 0.887
## 13 huber_loss_pseudo 1.74e- 6 0.887
## 14 iic 3.81e- 4 0.552