14.7 Validation

14.7.1 Confusion Matrices

# Attach the classifications from both naive Bayes models to every penguin
penguins <- mutate(
  penguins,
  class_1 = predict(naive_model_1, newdata = penguins),
  class_2 = predict(naive_model_2, newdata = penguins)
)

# Seed the RNG so the same 4 rows are drawn on every run
set.seed(84735)

# Peek at 4 randomly sampled penguins: the two measurements (given shorter
# names directly inside select()), the observed species, and both
# classifications
penguins %>%
  sample_n(4) %>%
  select(
    bill = bill_length_mm,
    flipper = flipper_length_mm,
    species,
    class_1,
    class_2
  )
## # A tibble: 4 × 5
##    bill flipper species   class_1 class_2  
##   <dbl>   <int> <fct>     <fct>   <fct>    
## 1  47.5     199 Chinstrap Gentoo  Chinstrap
## 2  40.9     214 Gentoo    Adelie  Gentoo   
## 3  41.3     194 Adelie    Adelie  Adelie   
## 4  38.5     190 Adelie    Adelie  Adelie
# Confusion matrix for naive_model_1:
# cross-tabulate observed species (rows) against class_1 (columns), then
# report each cell as a row percentage (2 decimals) with its count appended
tabyl(penguins, species, class_1) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()
##    species       Adelie Chinstrap       Gentoo
##     Adelie 95.39% (145) 0.00% (0)  4.61%   (7)
##  Chinstrap  5.88%   (4) 8.82% (6) 85.29%  (58)
##     Gentoo  6.45%   (8) 4.84% (6) 88.71% (110)
  • accuracy: 76 percent
  • 85 percent of Chinstrap penguins are misclassified as Gentoo!
# Confusion matrix for naive_model_2:
# cross-tabulate observed species (rows) against class_2 (columns), then
# report each cell as a row percentage (2 decimals) with its count appended
tabyl(penguins, species, class_2) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()
##    species       Adelie   Chinstrap       Gentoo
##     Adelie 96.05% (146)  2.63%  (4)  1.32%   (2)
##  Chinstrap  7.35%   (5) 86.76% (59)  5.88%   (4)
##     Gentoo  0.81%   (1)  0.81%  (1) 98.39% (122)
  • accuracy: 95 percent

14.7.2 Cross-Validation

# 10-fold cross-validation of naive_model_2 on the penguins data;
# the RNG is seeded immediately before the call
set.seed(84735)
cv_model_2 <- naive_classification_summary_cv(
  model = naive_model_2,
  data = penguins,
  y = "species",
  k = 10
)

# Print the `cv` element of the result
cv_model_2[["cv"]]
##    species       Adelie   Chinstrap       Gentoo
##     Adelie 96.05% (146)  2.63%  (4)  1.32%   (2)
##  Chinstrap  7.35%   (5) 86.76% (59)  5.88%   (4)
##     Gentoo  0.81%   (1)  0.81%  (1) 98.39% (122)