18.3 Implicit missing values

18.3.1 Implicit missing values

Consider a simple dataset that records the price of some stock each quarter:

stocks <- tibble::tibble(
  year  = c(2020, 2020, 2020, 2020, 2021, 2021, 2021),
  qtr   = c(   1,    2,    3,    4,    2,    3,    4),
  price = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

This dataset has two missing observations:

  • The price in the fourth quarter of 2020 is explicitly missing, because its value is NA.

  • The price for the first quarter of 2021 is implicitly missing, because it simply does not appear in the dataset.

If there is a need to make implicit missing values explicit, we can pivot the data using tidyr::pivot_wider.

wide_stocks <-  stocks |>
  tidyr::pivot_wider(
    names_from = "qtr", 
    values_from = "price"
  )

wide_stocks
## # A tibble: 2 × 5
##    year   `1`   `2`   `3`   `4`
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  2020  1.88  0.59  0.35 NA   
## 2  2021 NA     0.92  0.17  2.66

By default, making data longer using tidyr::pivot_longer preserves explicit missing values. We can drop them (make them implicit) by setting values_drop_na = TRUE.

wide_stocks |>
  tidyr::pivot_longer(
    cols = -c("year"),
    names_to = "qtr", 
    values_to = "price"
  )
## # A tibble: 8 × 3
##    year qtr   price
##   <dbl> <chr> <dbl>
## 1  2020 1      1.88
## 2  2020 2      0.59
## 3  2020 3      0.35
## 4  2020 4     NA   
## 5  2021 1     NA   
## 6  2021 2      0.92
## 7  2021 3      0.17
## 8  2021 4      2.66
wide_stocks |>
  tidyr::pivot_longer(
    cols = -c("year"),
    names_to = "qtr", 
    values_to = "price",
    values_drop_na = TRUE
  )
## # A tibble: 6 × 3
##    year qtr   price
##   <dbl> <chr> <dbl>
## 1  2020 1      1.88
## 2  2020 2      0.59
## 3  2020 3      0.35
## 4  2021 2      0.92
## 5  2021 3      0.17
## 6  2021 4      2.66

tidyr::complete() turns implicit missing values into explicit missing values based on combination values from its input columns.

stocks |>
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 7
  )
stocks |>
  tidyr::complete(
    .data[["year"]], 
    .data[["qtr"]]) |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 4
  )

Sometimes the individual variables are themselves incomplete and they is a need to provide your own data. For example, if we know that the stocks dataset is supposed to run from 2019 to 2021, we could explicitly supply those values for year.

stocks |>
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 7
  )
stocks |>
  tidyr::complete(
    `year` = 2019:2021, 
    .data[["qtr"]]) |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 4
  )

Another way to reveal implicitly missing observations is by using dplyr::anti_join. Here, four of the destinations do not have any airport metadata information.

# Get unique destination and rename to faa

dest_flights <- nycflights13::flights |> 
  dplyr::distinct(faa = .data[["dest"]])

dest_flights |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 5
  )
dest_flights |> 
  dplyr::anti_join(
    y = nycflights13::airports,
    by = dplyr::join_by("faa")
 ) |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 5
  )

Here, 722 planes do not have any planes metadata information.

# Get unique tail numbers

tailnum_flights <- nycflights13::flights |> 
  dplyr::distinct(.data[["tailnum"]])

tailnum_flights |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 5
  )
tailnum_flights |> 
  dplyr::anti_join(
    y = nycflights13::planes,
    by = dplyr::join_by("tailnum")
 ) |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 5
  )

18.3.2 dplyr::antijoin Extra

Use dplyr::anti_join to isolate rows causing dplyr::inner_join error.

Extra Weight Case:

three_penguins <- tibble::tribble(
  ~samp_id, ~species,    ~island,
  1,        "Adelie",    "Torgersen",
  2,        "Gentoo",    "Biscoe",
  3,        "Chinstrap", "Dream"
)

weight_extra <- tibble::tribble(
  ~samp_id,  ~body_mass_g,
  0,         1500,
  1,         3220,
  2,         4730,
  3,         4000,
  4,         1000,
  5,         1100
)

three_penguins |> 
  dplyr::inner_join(
    y = weight_extra,
    by = dplyr::join_by("samp_id"),
    unmatched = "error"
 ) 
## Error in `dplyr::inner_join()`:
## ! Each row of `y` must be matched by `x`.
## ℹ Row 1 of `y` was not matched.
weight_extra <- tibble::tribble(
  ~samp_id,  ~body_mass_g,
  0,         1500,
  1,         3220,
  2,         4730,
  3,         4000,
  4,         1000,
  5,         1100
)

weight_extra |> 
  dplyr::anti_join(
    y = three_penguins,
    by = dplyr::join_by("samp_id")
 ) 
## # A tibble: 3 × 2
##   samp_id body_mass_g
##     <dbl>       <dbl>
## 1       0        1500
## 2       4        1000
## 3       5        1100

Weight 3 Missing Case:

three_penguins <- tibble::tribble(
  ~samp_id, ~species,    ~island,
  1,        "Adelie",    "Torgersen",
  2,        "Gentoo",    "Biscoe",
  3,        "Chinstrap", "Dream"
)

weight_no_3 <- tibble::tribble(
  ~samp_id,  ~body_mass_g,
  1,         3220,
  2,         4730
)

three_penguins |> 
  dplyr::inner_join(
    y = weight_no_3,
    by = dplyr::join_by("samp_id"),
    unmatched = "error"
 ) 
## Error in `dplyr::inner_join()`:
## ! Each row of `x` must have a match in `y`.
## ℹ Row 3 of `x` does not have a match.
three_penguins |> 
  dplyr::anti_join(
    y = weight_no_3,
    by = dplyr::join_by("samp_id")
 ) 
## # A tibble: 1 × 3
##   samp_id species   island
##     <dbl> <chr>     <chr> 
## 1       3 Chinstrap Dream

Unfortunately cannot resolve multiple matches. Use argument both relationship = "one-to-one" and unmatched = "error" to ensure one row from x matches with exactly one row of y.

three_penguins <- tibble::tribble(
  ~samp_id, ~species,    ~island,
  1,        "Adelie",    "Torgersen",
  2,        "Gentoo",    "Biscoe",
  3,        "Chinstrap", "Dream"
)

weight_extra_2 <- tibble::tribble(
  ~samp_id,  ~body_mass_g,
  1,         3220,
  2,         4730,
  2,         4725,
  3,         4000
)

three_penguins |> 
  dplyr::inner_join(
    y = weight_extra_2,
    by = dplyr::join_by("samp_id"),
    relationship = "one-to-one",
    unmatched = "error"
 ) 
## Error in `dplyr::inner_join()`:
## ! Each row in `x` must match at most 1 row in `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.

18.3.3 Exercises

Can you find any relationship between the carrier and the rows that appear to be missing from planes ?

We first get all distinct carriers and tail numbers. We do a left join with the nycflights13::airlines so that we know what the carrier abbreviation means.

tailnum_carrier_flights <- nycflights13::flights |> 
  dplyr::distinct(.data[["tailnum"]], .data[["carrier"]]) |> 
  dplyr::arrange(.data[["carrier"]]) |> 
  dplyr::left_join(
    nycflights13::airlines,
    by = dplyr::join_by("carrier")
  )

tailnum_carrier_flights |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    filterable = TRUE,
    defaultPageSize = 5
  )

We now use dplyr::anti_join with nycflights13::planes to identify tail numbers that have no plane information.

We can see that most of them come from either AA (American Airlines Inc.) or MQ (Envoy Air)

missing_tailnum_carrier_flights <- tailnum_carrier_flights |> 
  dplyr::anti_join(
    y = nycflights13::planes,
    by = dplyr::join_by("tailnum")
 ) 

missing_tailnum_carrier_flights[["carrier"]] |> 
  table()
## 
##  9E  AA  B6  DL  F9  FL  MQ  UA  US  WN 
##   1 430   3  10   3  12 234  23   9   3
missing_tailnum_carrier_flights |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    filterable = TRUE,
    defaultPageSize = 5
  )