4.6 Visualizations for Categorical Data: Exploring the OKCupid dataset

OkCupid is an online dating site that serves international users. Kim and Escobedo-Land (2015) describe a data set where over 50,000 profiles from the San Francisco area.

load(url("https://github.com/topepo/FES/blob/master/Data_Sets/OkCupid/okc.RData?raw=true"))

First look at the dataset

# bind 'okc_test'
okc <- okc_train %>%
  bind_rows(okc_test)

Skim okc

skimr::skim(okc) %>% 
  knitr::kable()

skim_type	skim_variable	complete_rate	factor.ordered	factor.n_unique	factor.top_counts	numeric.mean	numeric.sd	numeric.p0	numeric.p25	numeric.p50	numeric.p75	numeric.p100	numeric.hist
factor	diet	1	FALSE	19	die: 19691, mos: 15229, any: 5440, str: 4583	NA	NA	NA	NA	NA	NA	NA	NA
factor	drinks	1	FALSE	7	soc: 36756, rar: 5328, oft: 4531, not: 2868	NA	NA	NA	NA	NA	NA	NA	NA
factor	drugs	1	FALSE	4	nev: 32805, dru: 11758, som: 6818, oft: 366	NA	NA	NA	NA	NA	NA	NA	NA
factor	education	1	FALSE	33	gra: 21423, gra: 8151, wor: 5193, ed_: 3572	NA	NA	NA	NA	NA	NA	NA	NA
factor	income	1	FALSE	13	mis: 40584, inc: 2876, inc: 1572, inc: 1080	NA	NA	NA	NA	NA	NA	NA	NA
factor	offspring	1	FALSE	16	kid: 29360, doe: 6744, doe: 3653, doe: 3322	NA	NA	NA	NA	NA	NA	NA	NA
factor	pets	1	FALSE	16	pet: 15181, lik: 13642, lik: 6534, lik: 3988	NA	NA	NA	NA	NA	NA	NA	NA
factor	religion	1	FALSE	10	rel: 15333, agn: 8043, oth: 7231, ath: 6316	NA	NA	NA	NA	NA	NA	NA	NA
factor	sign	1	FALSE	13	sig: 7818, leo: 3922, gem: 3911, can: 3784	NA	NA	NA	NA	NA	NA	NA	NA
factor	smokes	1	FALSE	6	no: 38941, smo: 3619, som: 3268, whe: 2688	NA	NA	NA	NA	NA	NA	NA	NA
factor	status	1	FALSE	5	sin: 48032, see: 1814, ava: 1624, mar: 269	NA	NA	NA	NA	NA	NA	NA	NA
factor	where_state	1	FALSE	36	cal: 51672, new: 15, ill: 6, mas: 4	NA	NA	NA	NA	NA	NA	NA	NA
factor	where_town	1	FALSE	51	san: 26683, oak: 6214, ber: 3616, san: 1168	NA	NA	NA	NA	NA	NA	NA	NA
factor	religion_modifer	1	FALSE	5	rel: 25718, but: 11469, and: 8309, and: 4221	NA	NA	NA	NA	NA	NA	NA	NA
factor	sign_modifer	1	FALSE	4	sig: 17909, and: 17709, but: 15511, and: 618	NA	NA	NA	NA	NA	NA	NA	NA
factor	Class	1	FALSE	2	oth: 42190, ste: 9557	NA	NA	NA	NA	NA	NA	NA	NA
numeric	age	1	NA	NA	NA	3.255509e+01	9.518919e+00	18	26.000000	30.000000	37.000000	109.000000	▇▂▁▁▁
numeric	height	1	NA	NA	NA	6.833121e+01	3.979818e+00	1	66.000000	68.000000	71.000000	95.000000	▁▁▁▇▁
numeric	last_online	1	NA	NA	NA	3.916536e+01	7.625808e+01	0	1.000000	4.000000	30.000000	370.000000	▇▁▁▁▁
numeric	cpp	1	NA	NA	NA	2.898700e-03	5.376220e-02	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	cpp_fluently	1	NA	NA	NA	1.252250e-02	1.112020e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	cpp_okay	1	NA	NA	NA	9.855600e-03	9.878610e-02	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	cpp_poorly	1	NA	NA	NA	7.382100e-03	8.560210e-02	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	lisp	1	NA	NA	NA	5.991000e-04	2.446880e-02	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	lisp_fluently	1	NA	NA	NA	1.488000e-03	3.854640e-02	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	lisp_okay	1	NA	NA	NA	2.319000e-03	4.810030e-02	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	lisp_poorly	1	NA	NA	NA	2.280300e-03	4.769870e-02	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	asian	1	NA	NA	NA	1.371094e-01	3.439661e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	black	1	NA	NA	NA	5.637040e-02	2.306379e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	hispanic_latin	1	NA	NA	NA	8.937720e-02	2.852901e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	indian	1	NA	NA	NA	2.467780e-02	1.551426e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	middle_eastern	1	NA	NA	NA	1.571110e-02	1.243564e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	native_american	1	NA	NA	NA	2.125730e-02	1.442422e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	other	1	NA	NA	NA	6.083440e-02	2.390287e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	pacific_islander	1	NA	NA	NA	2.475510e-02	1.553793e-01	0	0.000000	0.000000	0.000000	1.000000	▇▁▁▁▁
numeric	white	1	NA	NA	NA	6.474578e-01	4.777663e-01	0	0.000000	1.000000	1.000000	1.000000	▅▁▁▁▇
numeric	essay_length	1	NA	NA	NA	3.132866e+00	6.777757e-01	0	3.009026	3.269513	3.480438	4.983486	▁▁▂▇▁
numeric	profile	1	NA	NA	NA	2.587400e+04	1.493822e+04	1	12937.500000	25874.000000	38810.500000	51747.000000	▇▇▇▇▇

Plot Class

okc %>% 
  ggplot(aes(Class, fill = Class)) + 
  geom_bar() + 
  theme(legend.position = 'none')

4.6.1 Visualizing Relationships between Outcomes and Predictors

4.6.1.1 Outcome and Categorical Predictor

Let’s plot the frequency of the stated religion, partitioned and color by the outcome (Class).

binom_stats <- function(x, ...) {
  x <- x$Class[!is.na(x$Class)]
  res <- prop.test(x = sum(x == "stem"), n = length(x), ...)
  data.frame(Proportion  = unname(res$estimate), 
             Lower = res$conf.int[1],
             Upper = res$conf.int[2])
}

stem_rate <- mean(okc_train$Class == "stem")

religion_rates <- 
  okc_train %>%
  group_by(religion) %>%
  do(binom_stats(.)) %>%
  arrange(Proportion) %>%
  ungroup() %>%
  mutate(religion = gsub("religion_", "", religion),
         religion = reorder(factor(religion), Proportion))

okc_train <- 
  okc_train %>% 
  mutate(
    religion2 = gsub("religion_", "", as.character(religion)),
    religion2 = factor(religion2, levels = as.character(religion_rates$religion))
  )

bars <- 
  ggplot(okc_train, aes(x = religion2, fill = Class)) +
  geom_bar(position = position_dodge()) + scale_fill_brewer(palette = "Paired") +
  xlab("") +
  theme(legend.position = "top", axis.text = element_text(size = 8)) +
  ggtitle("(a)")

stacked_vars <-
  ggplot(okc_train, aes(x = religion2, fill = Class)) + geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Paired") +
  xlab("") + ylab("Proportion") +
  theme(legend.position = "none", axis.text = element_text(size = 8)) +
  ggtitle("(b)")

ci_plots <- 
  ggplot(religion_rates, aes(x = religion, y = Proportion)) +
  geom_hline(yintercept = stem_rate, col = "red", alpha = .35, lty = 2) + 
  geom_point() +
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = .1) +
  theme(axis.text = element_text(size = 8)) +
  xlab("") +
  ggtitle("(c)")

bars / stacked_vars / ci_plots

Does religion appear to be related to the outcome? Since there is a gradation of rates of STEM professions between the groups, it would appear so.

4.6.1.2 Outcome and Numerical Predictor

Now, let’s visualize the relationship between a categorical outcome (Class) and a numerical predictor (essay_length).

l10_breaks <- scales::trans_breaks("log10", function(x) 10^x)
l10_labels <- scales::trans_format("log10", scales::math_format(10^.x))

gam_dat <- 
  okc_train %>% 
  dplyr::select(essay_length, Class) %>% 
  arrange(essay_length)

gam_small <- 
  gam_dat %>%
  distinct(essay_length) 

gam_mod <- mgcv::gam(Class ~ s(essay_length), data = gam_dat, family = binomial())

gam_small <- gam_small %>%
  mutate(
    link = -predict(gam_mod, gam_small, type = "link"),
    se = predict(gam_mod, gam_small, type = "link", se.fit = TRUE)$se.fit,
    upper = link + qnorm(.975) * se,
    lower = link - qnorm(.975) * se,
    lower = binomial()$linkinv(lower),
    upper = binomial()$linkinv(upper),
    probability = binomial()$linkinv(link)
  )

brks <- l10_breaks(exp(okc_train$essay_length))

essay_hist <- 
  ggplot(okc_train, aes(x = exp(essay_length))) + 
  geom_histogram(binwidth = .1, col = "#FEB24C", fill = "#FED976") + 
  facet_wrap(~ Class, ncol = 1) + 
  scale_x_log10(breaks = brks, labels = l10_labels) +
  xlab("Essay Character Length") + 
  theme_bw() +
  theme(plot.margin = unit(c(0,1,0,1.2), "cm")) + 
  ggtitle("(a)")

essay_gam <- 
  ggplot(gam_small, aes(x = exp(essay_length))) + 
  geom_line(aes(y = probability)) + 
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey", alpha = .5) + 
  geom_hline(yintercept = stem_rate, col = "red", alpha = .35, lty = 2)  + 
  scale_x_log10(breaks = brks, labels = l10_labels) +
  theme_bw() + 
  xlab("") +
  theme(plot.margin = unit(c(0,1,0,1.2), "cm"))+ 
  ggtitle("(b)")

essay_hist / essay_gam

The black line represents the class probability of the logistic regression model and the bands denote 95% confidence intervals around the fit. The horizontal red line indicates the baseline probability of STEM profiles from the training set.

This predictor might be worth including in a model but is unlikely to show a strong effect on its own.

4.6.2 Exploring Relationships Between Categorical Predictors

When considering relationships between categorical data, there are several options. Once a cross-tabulation between variables is created, mosaic plots can once again be used to understand the relationship between variables.

okc_train <- 
  okc_train %>% 
  mutate(
    drugs = factor(as.character(drugs),
                   levels = c("drugs_missing", "never", "sometimes", "often")),
    drinks = factor(as.character(drinks),
                    levels = c("drinks_missing", "not_at_all", "rarely", 
                               "socially", "often", "very_often", "desperately"))
  )

dd_tab <- table(okc_train$drugs, okc_train$drinks, dnn = c("Drugs", "Alcohol"))


# Formatting for slightly better printing
plot_tab <- dd_tab
dimnames(plot_tab)[[1]][1] <- "missing"
dimnames(plot_tab)[[2]] <- gsub("_", " ", dimnames(plot_tab)[[2]])
dimnames(plot_tab)[[2]][1] <- "missing"
dimnames(plot_tab)[[2]][6] <- "often\n"
dimnames(plot_tab)[[2]][6] <- "very often"
dimnames(plot_tab)[[2]][7] <- "\ndesperately"

mosaic(
  t(plot_tab),
  highlighting = TRUE,
  highlighting_fill = rainbow_hcl,
  margins = unit(c(6, 1, 1, 8), "lines"),
  labeling = labeling_border(
    rot_labels = c(90, 0, 0, 0),
    just_labels = c("left", "right",
                    "center",  "right"),
    offset_varnames = unit(c(3, 1, 1, 4), "lines")
  ),
  keep_aspect_ratio = FALSE
)

In the cross-tabulation between alcohol and drug use, the χ2 statistic is very large (4043.8) for its degrees of freedom (18) and is associated with a very small p-value (0). This indicates that there is a strong association between these two variables.

ca_obj <- CA(dd_tab, graph = FALSE)

ca_drugs <- as.data.frame(ca_obj$row$coord)
ca_drugs$label <- gsub("_", " ", rownames(ca_drugs))
ca_drugs$Variable <- "Drugs"

ca_drinks <- as.data.frame(ca_obj$col$coord)
ca_drinks$label <- gsub("_", " ", rownames(ca_drinks))
ca_drinks$Variable <- "Alcohol"

ca_rng <- extendrange(c(ca_drinks$`Dim 1`, ca_drinks$`Dim 2`))
ca_x <- paste0("Dimension #1 (",
               round(ca_obj$eig["dim 1", "percentage of variance"], 0),
               "%)")
ca_y <- paste0("Dimension #2 (",
               round(ca_obj$eig["dim 2", "percentage of variance"], 0),
               "%)")

ca_coord <- rbind(ca_drugs, ca_drinks)

ca_plot <-
  ggplot(ca_coord, aes(x = `Dim 1`, y = `Dim 2`, col = Variable)) + 
  geom_vline(xintercept = 0) + 
  geom_hline(yintercept = 0) + 
  geom_text(aes(label = label)) + 
  xlim(ca_rng) + ylim(ca_rng) + 
  xlab(ca_x) + ylab(ca_y) + 
  coord_equal()

ca_plot

## Warning: Removed 1 rows containing missing values (`geom_text()`).

The correspondence analysis principal coordinates for the drug and alcohol data in the OkCupid data.