8.8 Missing data patterns for stations originally in the Chicago ridership data

Fig. 8.5

# Logical matrix flagging missing ridership values: one row per row of
# `raw_entries` and one column per station (the `date` column is dropped by
# name before testing for NA).
miss_entries <- 
     raw_entries %>%
     dplyr::select(-date) %>%
     is.na() 

# Number of missing values in each station column (named by station id).
miss_num <- apply(miss_entries, 2, sum)

# Flag the stations to show in the missing-data plot. This is derived from
# `miss_num` so that `date` is excluded by name (as above) instead of by
# assuming it is the first column, and so the NA counts are not recomputed.
# NOTE(review): `> 1` skips stations with exactly one missing value; confirm
# that this threshold (rather than `> 0`) is intended.
has_missing <- miss_num > 1
miss_station <- names(has_missing)[has_missing]

# Cluster the stations (not the dates) to get a reordering of the stations
# for plotting. Transposing puts one station per row, so the distances
# compare the stations' missingness indicators across dates.
miss_data <- is.na(raw_entries[, miss_station])

clst <- 
     miss_data %>%
     t() %>%
     dist() %>%
     hclust()
# Station ids alongside the clustering permutation. Note that `clst$order[i]`
# is the column index (into `miss_data`) of the station drawn at dendrogram
# position i; it is not the rank of the station listed in row i.
clst_stations <- 
     tibble(
          station_id = colnames(miss_data),
          order = clst$order
     )

# Attach the display name of every clustered station.
station_names <- 
     stations %>% 
     dplyr::select(name, station_id) %>%
     right_join(clst_stations, by = "station_id") 

# Station names in dendrogram order, used as factor levels for the plot.
# The permutation indexes the columns of `miss_data`, so apply it to those
# column names first and then look each id up in `station_names`. Indexing
# the rows of `station_names` directly is only correct when the join happens
# to return rows in the same order as `colnames(miss_data)`, which is not
# guaranteed.
station_lvl <- 
     station_names[["name"]][
          match(colnames(miss_data)[clst$order], station_names[["station_id"]])
     ]

# Long-format data for plotting: one row per date and station, restricted to
# the stations in `miss_station`. `status` marks each value as "missing" or
# "complete", and `name` is a factor whose levels follow `station_lvl`.
miss_vert <-
     raw_entries %>%
     gather(station_id, raw_entries, -date) %>%
     filter(station_id %in% miss_station) %>%
     mutate(
          status = factor(
               if_else(is.na(raw_entries), "missing", "complete"),
               levels = c("missing", "complete")
          )
     ) %>%
     full_join(station_names, by = "station_id") %>%
     mutate(name = factor(name, levels = station_lvl))

# Tile plot of the missing-data pattern: `date` on the x-axis, station `name`
# on the y-axis, grey-scale fill by `status`, blank axis titles, and the
# legend placed above the panel.
ggplot(miss_vert, aes(x = date, y = name, fill = status)) + 
     geom_tile() + 
     labs(x = "", y = "") + 
     scale_fill_grey() + 
     theme(legend.position = "top")

There are nine stations whose data are almost complete except for a single month gap. These stations are all on the Red Line, and their gaps occur during the Red Line Reconstruction Project, which affected stations from north of Cermak-Chinatown to the 95th Street station.