7.2 Demonstration

# Attach the tidyverse and tidymodels package collections used in this section.
library(tidyverse)
library(tidymodels)
# Resolve function-name clashes in favour of the tidymodels packages.
tidymodels_prefer()
# Fix the random seed so anything relying on random numbers is reproducible.
set.seed(123)

This demonstration uses a dataset from a previous Tidy Tuesday (2020-01-28): the San Francisco street-tree data, sf_trees. References:

# Download the tree data from the Tidy Tuesday GitHub repository.
sf_trees <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-28/sf_trees.csv'
)

# Preview the first 10 rows as a table inside a full-width scroll box.
sf_trees %>%
  head(10) %>%
  kableExtra::kable() %>%
  kableExtra::scroll_box(width = '100%')
tree_id legal_status species address site_order site_info caretaker date dbh plot_size latitude longitude
53719 Permitted Site Tree(s) :: 2963 Webster St 1 Sidewalk: Curb side : Cutout Private 1955-09-19 NA NA 37.79787 -122.4341
30313 Permitted Site Tree(s) :: 501 Arkansas St 3 Sidewalk: Curb side : Cutout Private 1955-10-20 NA NA 37.75984 -122.3981
30312 Permitted Site Tree(s) :: 501 Arkansas St 2 Sidewalk: Curb side : Cutout Private 1955-10-20 NA NA 37.75984 -122.3981
30314 DPW Maintained Pittosporum undulatum :: Victorian Box 501 Arkansas St 1 Sidewalk: Curb side : Cutout Private 1955-10-20 16 NA 37.75977 -122.3981
30315 Permitted Site Acacia melanoxylon :: Blackwood Acacia 1190 Sacramento St 5 Sidewalk: Curb side : Cutout Private 1955-10-24 NA NA 37.79265 -122.4124
30316 Permitted Site Acacia melanoxylon :: Blackwood Acacia 1190 Sacramento St 6 Sidewalk: Curb side : Cutout Private 1955-10-24 NA NA 37.79265 -122.4124
48435 Permitted Site Tree(s) :: 1190 Sacramento St 4 Sidewalk: Curb side : Cutout Private 1955-10-24 NA NA 37.79265 -122.4124
30319 Permitted Site Magnolia grandiflora :: Southern Magnolia 867 25th Ave 2 Sidewalk: Curb side : Cutout Private 1955-12-13 NA NA 37.77319 -122.4843
30318 Permitted Site Magnolia grandiflora :: Southern Magnolia 867 25th Ave 1 Sidewalk: Curb side : Cutout Private 1955-12-13 NA NA 37.77319 -122.4843
30320 Permitted Site Corymbia ficifolia :: Red Flowering Gum 867 25th Ave 3 Sidewalk: Curb side : Cutout Private 1955-12-13 NA NA 37.77319 -122.4843

The goal is to predict dbh, the diameter at breast height of each tree.

7.2.1 Some data exploration and cleaning

# Per-column summary of the raw data (missingness, ranges, distributions),
# rendered as a table inside a full-width scroll box.
skimr::skim(sf_trees) %>%
  kableExtra::kable() %>%
  kableExtra::scroll_box(width = '100%')
skim_type skim_variable n_missing complete_rate Date.min Date.max Date.median Date.n_unique character.min character.max character.empty character.n_unique character.whitespace numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
Date date 124610 0.3543088 1955-09-19 2020-01-25 2001-06-12 7404 NA NA NA NA NA NA NA NA NA NA NA NA NA
character legal_status 54 0.9997202 NA NA NA NA 7 28 0 9 0 NA NA NA NA NA NA NA NA
character species 0 1.0000000 NA NA NA NA 2 81 0 571 0 NA NA NA NA NA NA NA NA
character address 1487 0.9922948 NA NA NA NA 1 40 0 85909 0 NA NA NA NA NA NA NA NA
character site_info 0 1.0000000 NA NA NA NA 1 33 0 31 0 NA NA NA NA NA NA NA NA
character caretaker 0 1.0000000 NA NA NA NA 3 23 0 22 0 NA NA NA NA NA NA NA NA
character plot_size 50013 0.7408478 NA NA NA NA 1 23 0 524 0 NA NA NA NA NA NA NA NA
numeric tree_id 0 1.0000000 NA NA NA NA NA NA NA NA NA 126529.214071 7.931704e+04 1.0000 52601.50000 120862.00000 202607.50000 261546.00000 ▇▆▆▆▇
numeric site_order 1634 0.9915331 NA NA NA NA NA NA NA NA NA 4.579118 1.251574e+01 -50.0000 1.00000 2.00000 4.00000 501.00000 ▇▁▁▁▁
numeric dbh 41819 0.7833066 NA NA NA NA NA NA NA NA NA 9.953767 2.936408e+01 0.0000 3.00000 7.00000 12.00000 9999.00000 ▇▁▁▁▁
numeric latitude 2832 0.9853254 NA NA NA NA NA NA NA NA NA 37.766260 2.497521e-01 37.5090 37.74032 37.76024 37.77964 47.27022 ▇▁▁▁▁
numeric longitude 2832 0.9853254 NA NA NA NA NA NA NA NA NA -122.445586 4.152907e-01 -138.2839 -122.45430 -122.43140 -122.41295 -122.36662 ▁▁▁▁▇
# DataExplorer::create_report(sf_trees)
# Build the cleaned table used in the rest of the section.
# The thresholds below remove the implausible extremes seen in the raw
# summary (dbh up to 9999, latitude up to ~47.3, longitude down to ~-138.3,
# negative site_order). Rows where any condition evaluates to NA are dropped
# as well, so the numeric columns used here end up complete.
trees_cleaned <- sf_trees %>%
  select(-plot_size) %>%
  filter(
    !is.na(dbh),
    !is.na(legal_status),
    latitude <= 40,
    longitude >= -125,
    dbh > 0,
    dbh <= 100,
    site_order >= 0
  ) %>%
  # Friendlier names for the outcome and the planting date.
  rename(diam = dbh, date_planted = date)

# Re-run the per-column summary on the cleaned data to confirm the effect of
# the filters, again rendered as a table inside a full-width scroll box.
skimr::skim(trees_cleaned) %>%
  kableExtra::kable() %>%
  kableExtra::scroll_box(width = '100%')
skim_type skim_variable n_missing complete_rate Date.min Date.max Date.median Date.n_unique character.min character.max character.empty character.n_unique character.whitespace numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
Date date_planted 110867 0.2497378 1955-10-20 2020-01-25 2005-08-03 5990 NA NA NA NA NA NA NA NA NA NA NA NA NA
character legal_status 0 1.0000000 NA NA NA NA 7 28 0 9 0 NA NA NA NA NA NA NA NA
character species 0 1.0000000 NA NA NA NA 2 81 0 520 0 NA NA NA NA NA NA NA NA
character address 3 0.9999797 NA NA NA NA 1 35 0 71086 0 NA NA NA NA NA NA NA NA
character site_info 0 1.0000000 NA NA NA NA 1 33 0 26 0 NA NA NA NA NA NA NA NA
character caretaker 0 1.0000000 NA NA NA NA 3 23 0 22 0 NA NA NA NA NA NA NA NA
numeric tree_id 0 1.0000000 NA NA NA NA NA NA NA NA NA 138935.593229 8.043974e+04 1.0000 66037.5000 139248.00000 211074.50000 261546.00000 ▇▃▆▆▇
numeric site_order 0 1.0000000 NA NA NA NA NA NA NA NA NA 4.292466 8.770121e+00 0.0000 1.0000 2.00000 4.00000 168.00000 ▇▁▁▁▁
numeric diam 0 1.0000000 NA NA NA NA NA NA NA NA NA 9.946640 9.949371e+00 1.0000 3.0000 7.00000 12.00000 100.00000 ▇▁▁▁▁
numeric latitude 0 1.0000000 NA NA NA NA NA NA NA NA NA 37.760650 2.461270e-02 37.5090 37.7404 37.76128 37.78089 37.80902 ▁▁▁▅▇
numeric longitude 0 1.0000000 NA NA NA NA NA NA NA NA NA -122.434344 3.030750e-02 -122.5113 -122.4538 -122.43113 -122.41216 -122.36662 ▂▃▇▇▂
# Histogram of trunk diameter, drawn on a log10 x-axis.
trees_cleaned %>%
  ggplot(aes(diam)) +
  geom_histogram() +
  scale_x_log10()

# Diameter against planting date: 2-D bin counts with a smoothed trend on top.
trees_cleaned %>%
  ggplot(aes(date_planted, diam)) +
  geom_bin2d() +
  geom_smooth()