18.3 Implicit missing values

18.3.1 Implicit missing values

Consider a simple dataset that records the price of some stock each quarter:

stocks <- tibble::tibble(
  year  = c(2020, 2020, 2020, 2020, 2021, 2021, 2021),
  qtr   = c(   1,    2,    3,    4,    2,    3,    4),
  price = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

This dataset has two missing observations:

  • The price in the fourth quarter of 2020 is explicitly missing, because its value is NA.

  • The price for the first quarter of 2021 is implicitly missing, because it simply does not appear in the dataset.

If there is a need to make implicit missing values explicit, we can pivot the data using tidyr::pivot_wider.

wide_stocks <-  stocks |>
  tidyr::pivot_wider(
    names_from = "qtr", 
    values_from = "price"
  )

wide_stocks
## # A tibble: 2 × 5
##    year   `1`   `2`   `3`   `4`
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  2020  1.88  0.59  0.35 NA   
## 2  2021 NA     0.92  0.17  2.66

By default, making data longer using tidyr::pivot_longer preserves explicit missing values. We can drop them (make them implicit) by setting values_drop_na = TRUE.

wide_stocks |>
  tidyr::pivot_longer(
    cols = -c("year"),
    names_to = "qtr", 
    values_to = "price"
  )
## # A tibble: 8 × 3
##    year qtr   price
##   <dbl> <chr> <dbl>
## 1  2020 1      1.88
## 2  2020 2      0.59
## 3  2020 3      0.35
## 4  2020 4     NA   
## 5  2021 1     NA   
## 6  2021 2      0.92
## 7  2021 3      0.17
## 8  2021 4      2.66
wide_stocks |>
  tidyr::pivot_longer(
    cols = -c("year"),
    names_to = "qtr", 
    values_to = "price",
    values_drop_na = TRUE
  )
## # A tibble: 6 × 3
##    year qtr   price
##   <dbl> <chr> <dbl>
## 1  2020 1      1.88
## 2  2020 2      0.59
## 3  2020 3      0.35
## 4  2021 2      0.92
## 5  2021 3      0.17
## 6  2021 4      2.66

tidyr::complete() turns implicit missing values into explicit missing values based on combination values from its input columns.

stocks |>
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 7
  )
year
qtr
price
2020
1
1.88
2020
2
0.59
2020
3
0.35
2020
4
2021
2
0.92
2021
3
0.17
2021
4
2.66
stocks |>
  tidyr::complete(
    .data[["year"]], 
    .data[["qtr"]]) |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 4
  )
year
qtr
price
2020
1
1.88
2020
2
0.59
2020
3
0.35
2020
4
1–4 of 8 rows

Sometimes the individual variables are themselves incomplete and they is a need to provide your own data. For example, if we know that the stocks dataset is supposed to run from 2019 to 2021, we could explicitly supply those values for year.

stocks |>
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 7
  )
year
qtr
price
2020
1
1.88
2020
2
0.59
2020
3
0.35
2020
4
2021
2
0.92
2021
3
0.17
2021
4
2.66
stocks |>
  tidyr::complete(
    `year` = 2019:2021, 
    .data[["qtr"]]) |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 4
  )
year
qtr
price
2019
1
2019
2
2019
3
2019
4
1–4 of 12 rows

Another way to reveal implicitly missing observations is by using dplyr::anti_join. Here, four of the destinations do not have any airport metadata information.

# Get unique destination and rename to faa

dest_flights <- nycflights13::flights |> 
  dplyr::distinct(faa = .data[["dest"]])

dest_flights |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 5
  )
faa
IAH
MIA
BQN
ATL
ORD
1–5 of 105 rows
...
dest_flights |> 
  dplyr::anti_join(
    y = nycflights13::airports,
    by = dplyr::join_by("faa")
 ) |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 5
  )
faa
BQN
SJU
STT
PSE

Here, 722 planes do not have any planes metadata information.

# Get unique tail numbers

tailnum_flights <- nycflights13::flights |> 
  dplyr::distinct(.data[["tailnum"]])

tailnum_flights |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 5
  )
tailnum
N14228
N24211
N619AA
N804JB
N668DN
1–5 of 4044 rows
...
tailnum_flights |> 
  dplyr::anti_join(
    y = nycflights13::planes,
    by = dplyr::join_by("tailnum")
 ) |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    defaultPageSize = 5
  )
tailnum
N3ALAA
N3DUAA
N542MQ
N730MQ
N9EAMQ
1–5 of 722 rows
...

18.3.2 dplyr::antijoin Extra

Use dplyr::anti_join to isolate rows causing dplyr::inner_join error.

Extra Weight Case:

three_penguins <- tibble::tribble(
  ~samp_id, ~species,    ~island,
  1,        "Adelie",    "Torgersen",
  2,        "Gentoo",    "Biscoe",
  3,        "Chinstrap", "Dream"
)

weight_extra <- tibble::tribble(
  ~samp_id,  ~body_mass_g,
  0,         1500,
  1,         3220,
  2,         4730,
  3,         4000,
  4,         1000,
  5,         1100
)

three_penguins |> 
  dplyr::inner_join(
    y = weight_extra,
    by = dplyr::join_by("samp_id"),
    unmatched = "error"
 ) 
## Error in `dplyr::inner_join()`:
## ! Each row of `y` must be matched by `x`.
## ℹ Row 1 of `y` was not matched.
weight_extra <- tibble::tribble(
  ~samp_id,  ~body_mass_g,
  0,         1500,
  1,         3220,
  2,         4730,
  3,         4000,
  4,         1000,
  5,         1100
)

weight_extra |> 
  dplyr::anti_join(
    y = three_penguins,
    by = dplyr::join_by("samp_id")
 ) 
## # A tibble: 3 × 2
##   samp_id body_mass_g
##     <dbl>       <dbl>
## 1       0        1500
## 2       4        1000
## 3       5        1100

Weight 3 Missing Case:

three_penguins <- tibble::tribble(
  ~samp_id, ~species,    ~island,
  1,        "Adelie",    "Torgersen",
  2,        "Gentoo",    "Biscoe",
  3,        "Chinstrap", "Dream"
)

weight_no_3 <- tibble::tribble(
  ~samp_id,  ~body_mass_g,
  1,         3220,
  2,         4730
)

three_penguins |> 
  dplyr::inner_join(
    y = weight_no_3,
    by = dplyr::join_by("samp_id"),
    unmatched = "error"
 ) 
## Error in `dplyr::inner_join()`:
## ! Each row of `x` must have a match in `y`.
## ℹ Row 3 of `x` does not have a match.
three_penguins |> 
  dplyr::anti_join(
    y = weight_no_3,
    by = dplyr::join_by("samp_id")
 ) 
## # A tibble: 1 × 3
##   samp_id species   island
##     <dbl> <chr>     <chr> 
## 1       3 Chinstrap Dream

Unfortunately cannot resolve multiple matches. Use argument both relationship = "one-to-one" and unmatched = "error" to ensure one row from x matches with exactly one row of y.

three_penguins <- tibble::tribble(
  ~samp_id, ~species,    ~island,
  1,        "Adelie",    "Torgersen",
  2,        "Gentoo",    "Biscoe",
  3,        "Chinstrap", "Dream"
)

weight_extra_2 <- tibble::tribble(
  ~samp_id,  ~body_mass_g,
  1,         3220,
  2,         4730,
  2,         4725,
  3,         4000
)

three_penguins |> 
  dplyr::inner_join(
    y = weight_extra_2,
    by = dplyr::join_by("samp_id"),
    relationship = "one-to-one",
    unmatched = "error"
 ) 
## Error in `dplyr::inner_join()`:
## ! Each row in `x` must match at most 1 row in `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.

18.3.3 Exercises

Can you find any relationship between the carrier and the rows that appear to be missing from planes ?

We first get all distinct carriers and tail numbers. We do a left join with the nycflights13::airlines so that we know what the carrier abbreviation means.

tailnum_carrier_flights <- nycflights13::flights |> 
  dplyr::distinct(.data[["tailnum"]], .data[["carrier"]]) |> 
  dplyr::arrange(.data[["carrier"]]) |> 
  dplyr::left_join(
    nycflights13::airlines,
    by = dplyr::join_by("carrier")
  )

tailnum_carrier_flights |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    filterable = TRUE,
    defaultPageSize = 5
  )
tailnum
carrier
name
N915XJ
9E
Endeavor Air Inc.
N8444F
9E
Endeavor Air Inc.
N920XJ
9E
Endeavor Air Inc.
N8409N
9E
Endeavor Air Inc.
N8631E
9E
Endeavor Air Inc.
1–5 of 4067 rows
...

We now use dplyr::anti_join with nycflights13::planes to identify tail numbers that have no plane information.

We can see that most of them come from either AA (American Airlines Inc.) or MQ (Envoy Air)

missing_tailnum_carrier_flights <- tailnum_carrier_flights |> 
  dplyr::anti_join(
    y = nycflights13::planes,
    by = dplyr::join_by("tailnum")
 ) 

missing_tailnum_carrier_flights[["carrier"]] |> 
  table()
## 
##  9E  AA  B6  DL  F9  FL  MQ  UA  US  WN 
##   1 430   3  10   3  12 234  23   9   3
missing_tailnum_carrier_flights |> 
  reactable::reactable(
    theme = reactablefmtr::dark(),
    filterable = TRUE,
    defaultPageSize = 5
  )
tailnum
carrier
name
9E
Endeavor Air Inc.
N3ALAA
AA
American Airlines Inc.
N3DUAA
AA
American Airlines Inc.
N3EMAA
AA
American Airlines Inc.
N3BAAA
AA
American Airlines Inc.
1–5 of 728 rows
...