2.3 Example 2

Code for Section 2.4 at https://bookdown.org/max/FES/stroke-tour.html#stroke-exploration

Code to compare 2-way interaction models to their main effects model

a and b are two models from train()

compare_models_1way <- function(a, b, metric = a$metric[1], ...) {
  mods <- list(a, b)
  rs <- resamples(mods)
  diffs <- diff(rs, metric = metric[1], ...)
  diffs$statistics[[1]][[1]]
}

risk_preds is contained in the original data file and has the predictor names for the risk related variables

2.3.1 Create a “null model” with no predictors to get baseline performance

Compare the models with single predictors to the risk model. These data make

https://bookdown.org/max/FES/stroke-tour.html#tab:stroke-strokeRiskAssociations

VC_preds and risk_preds contain the predictor names for different sets.

one_predictor_res <- 
  data.frame(Predictor = c(VC_preds, risk_preds), 
             Improvement = NA,
             Pvalue = NA,
             ROC = NA,
             stringsAsFactors = FALSE)


for (i in 1:nrow(one_predictor_res)) {
  set.seed(63331)
  var_mod <- train(Stroke ~ ., 
                   data = stroke_train[, c("Stroke", one_predictor_res$Predictor[i])], 
                   method = "glm", 
                   metric = "ROC",
                   trControl = ctrl)  
  tmp_diff <- compare_models_1way(var_mod, 
                                  null_mod, 
                                  alternative = "greater")
  one_predictor_res$ROC[i] <- getTrainPerf(var_mod)[1, "TrainROC"]
  one_predictor_res$Improvement[i] <- tmp_diff$estimate
  one_predictor_res$Pvalue[i] <- tmp_diff$p.value
}

2.3.2 With Tidymodels

attempting this with tidymodels, not exactly the same need to make some sort of resample object to feed workflow_map

model_folds <- vfold_cv(tidy_training, v = 10, repeats = 5)
# model_boots <- bootstraps(tidy_training, times = 50) #want to bootstrap this?

#defining the null model
null_class_model <- 
    null_model() %>%
    set_engine("parsnip") %>%
    set_mode("classification")

null_model <- workflow_set(preproc = c(Stroke ~ .), 
                           models = list(null_mod = null_class_model))

# defining the model
lm_model <- 
  logistic_reg(
  mode = "classification", # outcome is a classification Stroke Y/N
  engine = "glm",   #using glm like the example
  penalty = NULL,   
  mixture = NULL        
)

# make named list of single variable formulae
single_var_formulae <-
    c(VC_preds, risk_preds) %>%
    paste0("Stroke ~ ", .) %>%
    set_names(., c(VC_preds, risk_preds)) %>%
    as.list() %>%
    map(., as.formula)

Create the workflow set, all of our models use the same type of model and input data

single_var_models <- workflow_set(preproc = single_var_formulae, 
                                     models = list(lm = lm_model))

all_models <- bind_rows(null_model, single_var_models)

# control grid/resamples allow processing of resampled data and parallel processing
# here we are asking only to save the predictions
control <- control_resamples(save_pred = TRUE)

doParallel::registerDoParallel()
all_models1 <- 
    all_models %>%
    #map over each model and its resamples, use the control parameters, and be noisy
    workflow_map(., 
                 resamples = model_folds, 
                 control = control, 
                 verbose = TRUE)

# save(all_models1, file = "data/all_models1.RData", compress = "xz")

# per model what are the predictions, summmarize over resamples
collect_predictions(all_models1, summarize = TRUE)

## # A tibble: 2,492 × 9
##    wflow_id         .config   preproc model  .row Stroke .pred_N .pred_Y .pred…¹
##    <chr>            <fct>     <chr>   <chr> <int> <fct>    <dbl>   <dbl> <fct>  
##  1 formula_null_mod Preproce… formula null…     1 N        0.495   0.505 Y      
##  2 formula_null_mod Preproce… formula null…     2 N        0.478   0.522 Y      
##  3 formula_null_mod Preproce… formula null…     3 N        0.495   0.505 Y      
##  4 formula_null_mod Preproce… formula null…     4 N        0.475   0.525 Y      
##  5 formula_null_mod Preproce… formula null…     5 N        0.485   0.515 Y      
##  6 formula_null_mod Preproce… formula null…     6 N        0.498   0.502 Y      
##  7 formula_null_mod Preproce… formula null…     7 N        0.488   0.512 Y      
##  8 formula_null_mod Preproce… formula null…     8 N        0.485   0.515 Y      
##  9 formula_null_mod Preproce… formula null…     9 N        0.491   0.509 Y      
## 10 formula_null_mod Preproce… formula null…    10 N        0.482   0.518 Y      
## # … with 2,482 more rows, and abbreviated variable name ¹.pred_class

# per model what are the outputs in terms of fit (auc_roc) and accuracy 
collect_metrics(all_models1)

## # A tibble: 56 × 9
##    wflow_id         .config    preproc model .metric .esti…¹  mean     n std_err
##    <chr>            <fct>      <chr>   <chr> <chr>   <chr>   <dbl> <int>   <dbl>
##  1 formula_null_mod Preproces… formula null… accura… binary  0.378    50  0.0131
##  2 formula_null_mod Preproces… formula null… roc_auc binary  0.5      50  0     
##  3 CALCVol_lm       Preproces… formula logi… accura… binary  0.388    50  0.0183
##  4 CALCVol_lm       Preproces… formula logi… roc_auc binary  0.457    50  0.0291
##  5 CALCVolProp_lm   Preproces… formula logi… accura… binary  0.441    50  0.0196
##  6 CALCVolProp_lm   Preproces… formula logi… roc_auc binary  0.502    50  0.0260
##  7 MATXVol_lm       Preproces… formula logi… accura… binary  0.346    50  0.0177
##  8 MATXVol_lm       Preproces… formula logi… roc_auc binary  0.380    50  0.0277
##  9 MATXVolProp_lm   Preproces… formula logi… accura… binary  0.415    50  0.0222
## 10 MATXVolProp_lm   Preproces… formula logi… roc_auc binary  0.516    50  0.0285
## # … with 46 more rows, and abbreviated variable name ¹.estimator

# plot the output to show which individual parameters have the most impact
autoplot(
   all_models1,
   rank_metric = "roc_auc",  # <- how to order models
   metric = "roc_auc",       # <- which metric to visualize
   select_best = FALSE     # <- one point per workflow
) +
   geom_text(aes(y = mean - 1/10*mean, 
                 label = wflow_id),
             angle = 90, hjust = 1) +
   theme(legend.position = "none")

# per model, sort by auc_roc
all_models1 %>%
rank_results() %>% 
   filter(.metric == "roc_auc")

## # A tibble: 28 × 9
##    wflow_id              .config .metric  mean std_err     n prepr…¹ model  rank
##    <chr>                 <fct>   <chr>   <dbl>   <dbl> <int> <chr>   <chr> <int>
##  1 MaxRemodelingRatio_lm Prepro… roc_auc 0.682  0.0300    50 formula logi…     1
##  2 MaxMaxWallThickness_… Prepro… roc_auc 0.669  0.0276    50 formula logi…     2
##  3 MaxStenosisByArea_lm  Prepro… roc_auc 0.651  0.0256    50 formula logi…     3
##  4 MaxStenosisByDiamete… Prepro… roc_auc 0.640  0.0281    50 formula logi…     4
##  5 MaxDilationByArea_lm  Prepro… roc_auc 0.625  0.0302    50 formula logi…     5
##  6 MaxLRNCArea_lm        Prepro… roc_auc 0.622  0.0281    50 formula logi…     6
##  7 LRNCVolProp_lm        Prepro… roc_auc 0.598  0.0260    50 formula logi…     7
##  8 MaxMATXArea_lm        Prepro… roc_auc 0.595  0.0298    50 formula logi…     8
##  9 MaxWallArea_lm        Prepro… roc_auc 0.591  0.0300    50 formula logi…     9
## 10 CoronaryArteryDiseas… Prepro… roc_auc 0.574  0.0208    50 formula logi…    10
## # … with 18 more rows, and abbreviated variable name ¹preprocessor

# Data in table 2.3
# https://bookdown.org/max/FES/stroke-tour.html#tab:stroke-strokeRiskAssociations

one_predictor_res %>% 
  dplyr::filter(Predictor %in% risk_preds) %>% 
  arrange(Pvalue)

##                     Predictor Improvement       Pvalue      ROC
## 1       CoronaryArteryDisease    0.079000 0.0002957741 0.579000
## 2             DiabetesHistory    0.066500 0.0003019908 0.566500
## 3         HypertensionHistory    0.065000 0.0004269919 0.565000
## 4                         age    0.082975 0.0011215271 0.582975
## 5          AtrialFibrillation    0.044000 0.0013131334 0.544000
## 6              SmokingHistory   -0.015000 0.7372947489 0.485000
## 7                         sex   -0.029000 0.8970159364 0.471000
## 8 HypercholesterolemiaHistory   -0.101000 0.9999998987 0.399000

# Figure 2.4
# https://bookdown.org/max/FES/stroke-tour.html#fig:stroke-vascuCAPAssocations

vc_pred <- 
  recipe(Stroke ~ ., data = stroke_train %>% dplyr::select(Stroke, !!!VC_preds)) %>% 
    step_YeoJohnson(all_predictors()) %>% 
    prep(stroke_train %>% dplyr::select(Stroke, !!!VC_preds)) %>% 
    bake(., new_data = NULL) %>% 
    gather(Predictor, value, -Stroke)
vc_pred%>%head

## # A tibble: 6 × 3
##   Stroke Predictor value
##   <fct>  <chr>     <dbl>
## 1 N      CALCVol    9.51
## 2 N      CALCVol    4.90
## 3 N      CALCVol    7.63
## 4 Y      CALCVol   13.2 
## 5 N      CALCVol    6.93
## 6 N      CALCVol    2.27

pred_max <- 
  vc_pred %>% 
  group_by(Predictor) %>% 
  summarize(max_val = max(value)) %>% 
  inner_join(one_predictor_res %>% dplyr::select(Pvalue, Predictor)) %>% 
  mutate(
    x = 1.5, 
    value = 1.25 * max_val,
    label = paste0("p-value: ", format.pval(Pvalue, digits = 2, sci = FALSE, eps = .0001))
  )

## Joining, by = "Predictor"

pred_max%>%head

## # A tibble: 6 × 6
##   Predictor     max_val   Pvalue     x     value label           
##   <chr>           <dbl>    <dbl> <dbl>     <dbl> <chr>           
## 1 CALCVol        15.7   0.000872   1.5    19.6   p-value: 0.00087
## 2 CALCVolProp     0.440 0.000856   1.5     0.550 p-value: 0.00086
## 3 LRNCVol        23.2   1.00       1.5    29.0   p-value: 1.00000
## 4 LRNCVolProp     0.495 0.207      1.5     0.618 p-value: 0.20723
## 5 MATXVol     41681.    1.00       1.5 52101.    p-value: 1.00000
## 6 MATXVolProp     0.870 1.00       1.5     1.09  p-value: 1.00000

new_order <- pred_max$Predictor[order(pred_max$Pvalue)]

vc_pred <- 
  vc_pred %>% 
  mutate(Predictor = factor(Predictor, levels = new_order))

pred_max <- 
  pred_max %>% 
  mutate(Predictor = factor(Predictor, levels = new_order))

fig_2_4 <-
  ggplot(vc_pred, aes(x = Stroke, y = value)) + 
  geom_boxplot() + 
  geom_point(alpha = 0.3, cex = .5) + 
  geom_text(data = pred_max, aes(x = x, label = label), size = 3) + 
  facet_wrap(~Predictor, scales = "free_y") + 
  ylab("")
fig_2_4

# Figure 2.5
# https://bookdown.org/max/FES/stroke-tour.html#fig:stroke-maxRemodelingRatioROC

fig_2_5 <- 
  roc_curve(stroke_train, Stroke, MaxRemodelingRatio) %>% 
  # used opposite values
  ggplot(aes(x = specificity, y = 1-sensitivity)) + 
  geom_abline(alpha = .5, lty = 2) +
  geom_path()

fig_2_5

2.3.3 Interaction exploration

Here they create all the pairs of all of the image analysis components there are 171 interactions

pairs <- 
  combn(VC_preds, 2) %>% 
  t() %>% 
  as.data.frame(stringsAsFactors = FALSE) %>% 
  mutate(
    Improvement = NA,
    Pvalue = NA,
    ROC = NA
  )

Run comparisons with caret

retained_pairs <- 
  pairs1 %>% 
  dplyr::filter(ROC > 0.5  & Pvalue <= 0.2)
retained_pairs%>%head

##        V1                    V2 Improvement      Pvalue     ROC
## 1 MATXVol   MaxMaxWallThickness     0.07635 0.001903838 0.65435
## 2 MATXVol    MaxRemodelingRatio     0.11085 0.001345363 0.55150
## 3 MATXVol     MaxStenosisByArea     0.02325 0.086210254 0.60960
## 4 MATXVol MaxStenosisByDiameter     0.01930 0.197228736 0.61550
## 5 LRNCVol           MaxMATXArea     0.02590 0.152003535 0.55690
## 6 LRNCVol    MaxRemodelingRatio     0.03345 0.119774508 0.59510

# Figure 2.6
# https://bookdown.org/max/FES/stroke-tour.html#fig:stroke-interactionScreening

vol_plot <- 
  pairs1 %>% 
  dplyr::filter(ROC > 0.5) %>%
  mutate(Term = paste(V1, "by", V2, "\nROC:", round(ROC, 2))) %>%
  ggplot(aes(x = Improvement, y = -log10(Pvalue))) + 
  xlab("Improvement") +
  geom_point(alpha = .2, aes(size = ROC, text = Term))

vol_plot <- ggplotly(vol_plot, tooltip = "Term")
vol_plot

Create interaction formula of things that matter most

int_form <- 
  pairs1 %>% 
  dplyr::filter(ROC > 0.5  & Pvalue <= 0.2 & Improvement > 0) %>% 
  mutate(form  = paste0(V1, ":", V2)) %>% 
  pull(form) %>% 
  paste(collapse = "+")

int_form <- paste("~", int_form)
int_form <- as.formula(int_form)
int_form%>%head

## ~MATXVol:MaxMaxWallThickness + MATXVol:MaxRemodelingRatio + MATXVol:MaxStenosisByArea + 
##     MATXVol:MaxStenosisByDiameter + LRNCVol:MaxMATXArea + LRNCVol:MaxRemodelingRatio + 
##     MaxCALCAreaProp:MaxMATXAreaProp + MaxCALCAreaProp:MaxRemodelingRatio + 
##     MaxDilationByArea:MaxMaxWallThickness + MaxDilationByArea:MaxRemodelingRatio + 
##     MaxMATXAreaProp:MaxLRNCArea + MaxMATXAreaProp:MaxRemodelingRatio + 
##     MaxMATXAreaProp:MaxWallArea + MaxMaxWallThickness:MaxStenosisByArea + 
##     MaxMaxWallThickness:WallVol + MaxRemodelingRatio:MaxWallArea + 
##     MaxRemodelingRatio:WallVol + MaxStenosisByArea:WallVol

This part of the script is to work through all of of the potential models:

original risk set alone
imaging predictors alone
risk and imaging predictors together
imaging predictors and interactions of imaging predictors, and
risk, imaging predictors, and interactions of imaging predictors

All the models are run below.

risk_train <- 
  stroke_train %>%
  dplyr::select(one_of(risk_preds), Stroke)

risk_train%>%head

##   age sex SmokingHistory AtrialFibrillation CoronaryArteryDisease
## 1  72   1              1                  0                     0
## 2  76   1              1                  0                     0
## 4  72   0              0                  0                     0
## 5  61   1              1                  0                     0
## 7  65   1              0                  0                     0
## 8  64   1              1                  0                     1
##   DiabetesHistory HypercholesterolemiaHistory HypertensionHistory Stroke
## 1               0                           0                   1      N
## 2               1                           1                   1      N
## 4               0                           0                   0      N
## 5               1                           1                   1      Y
## 7               0                           0                   1      N
## 8               0                           1                   1      N

image_train <- 
  stroke_train %>%
  dplyr::select(one_of(VC_preds), Stroke)

image_train%>%head

##      CALCVol CALCVolProp  MATXVol MATXVolProp   LRNCVol LRNCVolProp MaxCALCArea
## 1 235.252599 0.070442702 3156.835   0.7599582 224.87171  0.09108513   12.350494
## 2  31.433595 0.016164769 3032.861   0.8133063 368.56066  0.13398944    7.130660
## 4 113.404823 0.038081488 3835.220   0.7825256 321.15893  0.08303659   16.286916
## 5 780.823789 0.213432061 3518.877   0.7610895 140.51735  0.03206536   63.350869
## 7  84.055774 0.041383842 2990.273   0.7498691 293.26992  0.07539753   17.583561
## 8   5.644322 0.002824946 3359.323   0.8492801  55.76888  0.01983567    2.841252
##   MaxCALCAreaProp MaxDilationByArea MaxMATXArea MaxMATXAreaProp MaxLRNCArea
## 1      0.36576842         520.98259    71.24743       0.9523705   21.686815
## 2      0.21124686          91.72005    27.21084       0.9455539    6.434661
## 4      0.40881121         270.96930    38.12211       0.9459098    5.705054
## 5      0.57620574        2270.45120   341.12089       0.9691989    6.046787
## 7      0.32150685          95.15505    56.57457       0.9213197    7.213451
## 8      0.07734609         298.42121    33.92709       0.9612049    4.595196
##   MaxLRNCAreaProp MaxMaxWallThickness MaxRemodelingRatio MaxStenosisByArea
## 1       0.4295781            2.409943           5.697931          18.99554
## 2       0.2815101            2.540334           1.739927          30.23761
## 4       0.1547786            3.708515           2.831636          33.93906
## 5       0.1870965            6.115838          15.647750          34.30985
## 7       0.2169263            3.975168           1.912069          36.59954
## 8       0.2053274            2.581908           2.181675          40.31766
##   MaxWallArea  WallVol MaxStenosisByDiameter Stroke
## 1   106.20676 4192.170              10.54411      N
## 2    33.36714 3917.040              18.64620      N
## 4    55.34671 4935.327              19.73511      N
## 5   426.47858 4909.504              20.28832      Y
## 7    59.82696 4045.053              49.29705      N
## 8    34.79742 3960.832              30.50857      N

fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))
internal_ctrl = trainControl(method = "none", classProbs = TRUE,
                             allowParallel = FALSE)
lrFuncsNew <- caretFuncs  
lrFuncsNew$summary <- fiveStats
rfeCtrl <- rfeControl(functions = lrFuncsNew,
                      method = "repeatedcv",
                      repeats = 5,
                      rerank = FALSE,
                      returnResamp = "all",
                      saveDetails = TRUE,
                      verbose = FALSE)

RFE procedure using risk predictors

All pair-wise interactions.

risk_int_filtered_recipe <-
  recipe(Stroke ~ ., data = risk_train) %>%
  step_interact(~ all_predictors():all_predictors()) %>%
  step_corr(all_predictors(), threshold = 0.75) %>% 
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
risk_int_filtered_rfe <- rfe(
  risk_int_filtered_recipe,
  data = risk_train,
  sizes = 1:36,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

# Main effects
risk_main_filtered_recipe <-
  recipe(Stroke ~ ., data = risk_train) %>%
  step_corr(all_predictors(), threshold = 0.75) %>% 
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
risk_main_filtered_rfe <- rfe(
  risk_main_filtered_recipe,
  data = risk_train,
  sizes = 1:8,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

RFE procedure using imaging predictors.

img_int_filtered_recipe <-
  recipe(Stroke ~ ., data = image_train) %>%
  step_interact(int_form)  %>%
  step_corr(all_predictors(), threshold = 0.75) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
img_int_filtered_rfe <- rfe(
  img_int_filtered_recipe,
  data = image_train,
  sizes = 1:35,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

img_main_filtered_recipe <-
  recipe(Stroke ~ ., data = image_train)  %>%
  step_corr(all_predictors(), threshold = 0.75) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
img_main_filtered_rfe <- rfe(
  img_main_filtered_recipe,
  data = image_train,
  sizes = 1:19,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

both_int_filtered_recipe <-
  recipe(Stroke ~ ., data = stroke_train) %>%
  step_interact(int_form)  %>%
  step_corr(all_predictors(), threshold = 0.75) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
both_int_filtered_rfe <- rfe(
  both_int_filtered_recipe,
  data = stroke_train,
  sizes = 1:44,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

both_main_filtered_recipe <-
  recipe(Stroke ~ ., data = stroke_train)  %>%
  step_corr(all_predictors(), threshold = 0.75) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
both_main_filtered_rfe <- rfe(
  both_main_filtered_recipe,
  data = stroke_train,
  sizes = 1:28,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

risk_int_recipe <-
  recipe(Stroke ~ ., data = risk_train) %>%
  step_interact(~ all_predictors():all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
risk_int_rfe <- rfe(
  risk_int_recipe,
  data = risk_train,
  sizes = 1:36,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

risk_main_recipe <-
  recipe(Stroke ~ ., data = risk_train) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
risk_main_rfe <- rfe(
  risk_main_recipe,
  data = risk_train,
  sizes = 1:8,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

img_int_recipe <-
  recipe(Stroke ~ ., data = image_train) %>%
  step_interact(int_form)  %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
img_int_rfe <- rfe(
  img_int_recipe,
  data = image_train,
  sizes = 1:35,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

img_main_recipe <-
  recipe(Stroke ~ ., data = image_train)  %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
img_main_rfe <- rfe(
  img_main_recipe,
  data = image_train,
  sizes = 1:19,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

both_int_recipe <-
  recipe(Stroke ~ ., data = stroke_train) %>%
  step_interact(int_form)  %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
both_int_rfe <- rfe(
  both_int_recipe,
  data = stroke_train,
  sizes = 1:44,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

both_main_recipe <-
  recipe(Stroke ~ ., data = stroke_train)  %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_zv(all_predictors())

set.seed(63331)
both_main_rfe <- rfe(
  both_main_recipe,
  data = stroke_train,
  sizes = 1:28,
  rfeControl = rfeCtrl,
  metric = "ROC",
  ## train options
  method = "glm",
  trControl = internal_ctrl
)

format_data <- function(x, lab, int = FALSE) {
  dat <- 
    x %>% 
    pluck("results") %>% 
    mutate(Predictors = !!lab) %>% 
    dplyr::select(ROC, Variables, Predictors, Variables, Num_Resamples) %>% 
    mutate(Model = "Main Effects")
  if (int)
    dat$Model <- "Interactions"
  dat
  
}

filtered_dat <- 
  bind_rows(
    format_data(risk_main_filtered_rfe, lab = "Risk Predictors"),
    format_data(risk_int_filtered_rfe, lab = "Risk Predictors", TRUE),
    format_data(img_main_filtered_rfe, lab = "Imaging Predictors"),
    format_data(img_int_filtered_rfe, lab = "Imaging Predictors", TRUE),
    format_data(both_main_filtered_rfe, lab = "All Predictors"),
    format_data(both_int_filtered_rfe, lab = "All Predictors", TRUE)
  ) %>% 
  mutate(
    Predictors = factor(
      Predictors, 
      levels = c("Risk Predictors", "Imaging Predictors", "All Predictors")
    ),
    Model = factor(Model, levels = c("Main Effects", "Interactions")),
    Filtering = "Correlation Filter"
  )

unfiltered_dat <- 
  bind_rows(
    format_data(risk_main_rfe, lab = "Risk Predictors"),
    format_data(risk_int_rfe, lab = "Risk Predictors", TRUE),
    format_data(img_main_rfe, lab = "Imaging Predictors"),
    format_data(img_int_rfe, lab = "Imaging Predictors", TRUE),
    format_data(both_main_rfe, lab = "All Predictors"),
    format_data(both_int_rfe, lab = "All Predictors", TRUE)
  ) %>% 
  mutate(
    Predictors = factor(
      Predictors, 
      levels = c("Risk Predictors", "Imaging Predictors", "All Predictors")
    ),
    Model = factor(Model, levels = c("Main Effects", "Interactions")),
    Filtering = "No Filter"
  )

rfe_data <- 
  bind_rows(filtered_dat, unfiltered_dat) %>% 
  mutate(
    Filtering = factor(Filtering, levels = c("No Filter", "Correlation Filter"))
  )

# https://bookdown.org/max/FES/predictive-modeling-across-sets.html#fig:stroke-rfe-res
ggplot(rfe_data, aes(x = Variables, y = ROC, col = Model)) +
  geom_point(size = 0.75) + 
  geom_line() + 
  facet_grid(Filtering ~ Predictors) + 
  scale_color_manual(values = c("#6A3D9A", "#CAB2D6"))

# https://bookdown.org/max/FES/predictive-modeling-across-sets.html#tab:stroke-rfe-tab
rfe_tab <-
  img_main_filtered_rfe %>% 
  pluck("variables") %>% 
  filter(Variables == img_main_filtered_rfe$optsize) %>% 
  group_by(var) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  mutate(final = ifelse(var %in% img_main_filtered_rfe$optVariables, "Yes", "No")) %>% 
  ungroup()