7.2 Demonstration
# Attach the tidyverse (supplies the %>% pipe, the dplyr verbs and ggplot2 used below).
library(tidyverse)
# Attach the tidymodels modelling packages.
library(tidymodels)
# Resolve function-name conflicts between attached packages in favour of tidymodels.
tidymodels_prefer()
# Fix the random seed so that any random steps that follow are reproducible.
set.seed(123)
A previous Tidy Tuesday dataset (the San Francisco trees data from 2020-01-28) is used for demonstration. References:
# Download the Tidy Tuesday (2020-01-28) trees data into `sf_trees`.
sf_trees <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-28/sf_trees.csv')
# Preview the first 10 rows as a horizontally scrollable table.
kableExtra::kable(head(sf_trees, 10)) %>%
  kableExtra::scroll_box(width = '100%')
tree_id | legal_status | species | address | site_order | site_info | caretaker | date | dbh | plot_size | latitude | longitude |
---|---|---|---|---|---|---|---|---|---|---|---|
53719 | Permitted Site | Tree(s) :: | 2963 Webster St | 1 | Sidewalk: Curb side : Cutout | Private | 1955-09-19 | NA | NA | 37.79787 | -122.4341 |
30313 | Permitted Site | Tree(s) :: | 501 Arkansas St | 3 | Sidewalk: Curb side : Cutout | Private | 1955-10-20 | NA | NA | 37.75984 | -122.3981 |
30312 | Permitted Site | Tree(s) :: | 501 Arkansas St | 2 | Sidewalk: Curb side : Cutout | Private | 1955-10-20 | NA | NA | 37.75984 | -122.3981 |
30314 | DPW Maintained | Pittosporum undulatum :: Victorian Box | 501 Arkansas St | 1 | Sidewalk: Curb side : Cutout | Private | 1955-10-20 | 16 | NA | 37.75977 | -122.3981 |
30315 | Permitted Site | Acacia melanoxylon :: Blackwood Acacia | 1190 Sacramento St | 5 | Sidewalk: Curb side : Cutout | Private | 1955-10-24 | NA | NA | 37.79265 | -122.4124 |
30316 | Permitted Site | Acacia melanoxylon :: Blackwood Acacia | 1190 Sacramento St | 6 | Sidewalk: Curb side : Cutout | Private | 1955-10-24 | NA | NA | 37.79265 | -122.4124 |
48435 | Permitted Site | Tree(s) :: | 1190 Sacramento St | 4 | Sidewalk: Curb side : Cutout | Private | 1955-10-24 | NA | NA | 37.79265 | -122.4124 |
30319 | Permitted Site | Magnolia grandiflora :: Southern Magnolia | 867 25th Ave | 2 | Sidewalk: Curb side : Cutout | Private | 1955-12-13 | NA | NA | 37.77319 | -122.4843 |
30318 | Permitted Site | Magnolia grandiflora :: Southern Magnolia | 867 25th Ave | 1 | Sidewalk: Curb side : Cutout | Private | 1955-12-13 | NA | NA | 37.77319 | -122.4843 |
30320 | Permitted Site | Corymbia ficifolia :: Red Flowering Gum | 867 25th Ave | 3 | Sidewalk: Curb side : Cutout | Private | 1955-12-13 | NA | NA | 37.77319 | -122.4843 |
The goal is to predict `dbh`, the diameter of a tree at breast height.
7.2.1 Some data exploration and cleaning
# Summarise every column of the raw data (missingness, ranges, distributions)
# and render the summary as a horizontally scrollable table.
kableExtra::kable(skimr::skim(sf_trees)) %>%
  kableExtra::scroll_box(width = '100%')
skim_type | skim_variable | n_missing | complete_rate | Date.min | Date.max | Date.median | Date.n_unique | character.min | character.max | character.empty | character.n_unique | character.whitespace | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | date | 124610 | 0.3543088 | 1955-09-19 | 2020-01-25 | 2001-06-12 | 7404 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
character | legal_status | 54 | 0.9997202 | NA | NA | NA | NA | 7 | 28 | 0 | 9 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | species | 0 | 1.0000000 | NA | NA | NA | NA | 2 | 81 | 0 | 571 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | address | 1487 | 0.9922948 | NA | NA | NA | NA | 1 | 40 | 0 | 85909 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | site_info | 0 | 1.0000000 | NA | NA | NA | NA | 1 | 33 | 0 | 31 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | caretaker | 0 | 1.0000000 | NA | NA | NA | NA | 3 | 23 | 0 | 22 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | plot_size | 50013 | 0.7408478 | NA | NA | NA | NA | 1 | 23 | 0 | 524 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
numeric | tree_id | 0 | 1.0000000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 126529.214071 | 7.931704e+04 | 1.0000 | 52601.50000 | 120862.00000 | 202607.50000 | 261546.00000 | ▇▆▆▆▇ |
numeric | site_order | 1634 | 0.9915331 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 4.579118 | 1.251574e+01 | -50.0000 | 1.00000 | 2.00000 | 4.00000 | 501.00000 | ▇▁▁▁▁ |
numeric | dbh | 41819 | 0.7833066 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 9.953767 | 2.936408e+01 | 0.0000 | 3.00000 | 7.00000 | 12.00000 | 9999.00000 | ▇▁▁▁▁ |
numeric | latitude | 2832 | 0.9853254 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 37.766260 | 2.497521e-01 | 37.5090 | 37.74032 | 37.76024 | 37.77964 | 47.27022 | ▇▁▁▁▁ |
numeric | longitude | 2832 | 0.9853254 | NA | NA | NA | NA | NA | NA | NA | NA | NA | -122.445586 | 4.152907e-01 | -138.2839 | -122.45430 | -122.43140 | -122.41295 | -122.36662 | ▁▁▁▁▇ |
# DataExplorer::create_report(sf_trees)
# Clean the raw tree records before modelling:
#   * rename `dbh` -> `diam` (the outcome) and `date` -> `date_planted`;
#   * keep only rows with a recorded diameter and legal status;
#   * drop coordinates far from the bulk of the data (the raw summary shows
#     latitudes up to 47.3 and longitudes down to -138.3);
#   * drop implausible diameters (raw maximum is 9999) and zero diameters;
#   * drop negative site orders (raw minimum is -50);
#   * drop `plot_size`, which is about 26% missing and has 524 distinct values.
# filter() keeps only rows where the condition is TRUE, so each comparison
# also removes rows where the compared column is NA.
trees_cleaned <- sf_trees %>%
  rename(diam = dbh, date_planted = date) %>%
  filter(!is.na(diam)) %>%
  filter(!is.na(legal_status)) %>%
  filter(latitude <= 40 & longitude >= -125) %>%
  filter(diam <= 100 & diam > 0) %>%
  filter(site_order >= 0) %>%
  select(-plot_size)
# Re-run the column summary on the cleaned data to confirm the effect of the
# filters, rendered as a horizontally scrollable table.
kableExtra::kable(skimr::skim(trees_cleaned)) %>%
  kableExtra::scroll_box(width = '100%')
skim_type | skim_variable | n_missing | complete_rate | Date.min | Date.max | Date.median | Date.n_unique | character.min | character.max | character.empty | character.n_unique | character.whitespace | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | date_planted | 110867 | 0.2497378 | 1955-10-20 | 2020-01-25 | 2005-08-03 | 5990 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
character | legal_status | 0 | 1.0000000 | NA | NA | NA | NA | 7 | 28 | 0 | 9 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | species | 0 | 1.0000000 | NA | NA | NA | NA | 2 | 81 | 0 | 520 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | address | 3 | 0.9999797 | NA | NA | NA | NA | 1 | 35 | 0 | 71086 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | site_info | 0 | 1.0000000 | NA | NA | NA | NA | 1 | 33 | 0 | 26 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
character | caretaker | 0 | 1.0000000 | NA | NA | NA | NA | 3 | 23 | 0 | 22 | 0 | NA | NA | NA | NA | NA | NA | NA | NA |
numeric | tree_id | 0 | 1.0000000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 138935.593229 | 8.043974e+04 | 1.0000 | 66037.5000 | 139248.00000 | 211074.50000 | 261546.00000 | ▇▃▆▆▇ |
numeric | site_order | 0 | 1.0000000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 4.292466 | 8.770121e+00 | 0.0000 | 1.0000 | 2.00000 | 4.00000 | 168.00000 | ▇▁▁▁▁ |
numeric | diam | 0 | 1.0000000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 9.946640 | 9.949371e+00 | 1.0000 | 3.0000 | 7.00000 | 12.00000 | 100.00000 | ▇▁▁▁▁ |
numeric | latitude | 0 | 1.0000000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 37.760650 | 2.461270e-02 | 37.5090 | 37.7404 | 37.76128 | 37.78089 | 37.80902 | ▁▁▁▅▇ |
numeric | longitude | 0 | 1.0000000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | -122.434344 | 3.030750e-02 | -122.5113 | -122.4538 | -122.43113 | -122.41216 | -122.36662 | ▂▃▇▇▂ |
# Histogram of trunk diameter; the log10 x-axis spreads out the right-skewed values.
trees_cleaned %>%
  ggplot(aes(x = diam)) +
  geom_histogram() +
  scale_x_log10()
# Diameter against planting date: 2-D bin counts with a smoothed trend on top.
trees_cleaned %>%
  ggplot(aes(x = date_planted, y = diam)) +
  geom_bin2d() +
  geom_smooth()