9.3 9.2 Words in newsgroups

Use tf-idf within newsgroups to see which words are the most frequent and distinctive in each one

First step: get the frequency of words by newsgroup:

# Tally how often each word occurs in each newsgroup, most frequent pairs
# first; ungroup() drops any grouping carried over from the input.
words_by_newsgroup <- ungroup(
  count(usenet_words, newsgroup, word, sort = TRUE)
)

# Treat every newsgroup as one document, let bind_tf_idf() append the
# tf_idf column, and rank the rows from highest to lowest tf-idf.
tf_idf <- words_by_newsgroup %>%
  bind_tf_idf(term = word, document = newsgroup, n = n) %>%
  arrange(desc(tf_idf))

# Peek at the highest-scoring rows.
head(tf_idf, n = 6L)

VISUALIZE EACH BOARD IN A TOPIC

# Plot the highest tf-idf words of every talk.* newsgroup, one facet per
# newsgroup (slice_max keeps ties, so a facet can show more than 12 words).
tf_idf %>%
  filter(str_detect(newsgroup, "^talk\\.")) %>%
  group_by(newsgroup) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  # Order words separately inside each facet. A plain reorder() builds one
  # global ordering, so a word that ranks in several newsgroups would be
  # placed by its average tf-idf instead of its value in that facet.
  mutate(word = reorder_within(word, tf_idf, newsgroup)) %>%
  ggplot(aes(tf_idf, word, fill = newsgroup)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ newsgroup, scales = "free") +
  # Strip the per-facet suffix that reorder_within() adds to the labels.
  scale_y_reordered() +
  labs(x = "tf-idf", y = NULL)

Calculating pairwise correlations between newsgroups, based on the word counts within each newsgroup

library(widyr)

# Correlate every pair of newsgroups using their word counts:
#   item    = newsgroup (the things being compared),
#   feature = word (what links one newsgroup to another),
#   value   = n (the count the correlation is computed on).
# sort = TRUE lists the most strongly correlated pairs first.
newsgroup_cors <- pairwise_cor(
  words_by_newsgroup,
  item = newsgroup,
  feature = word,
  value = n,
  sort = TRUE
)

head(newsgroup_cors, n = 6L)

Topic Modeling!

Latent Dirichlet Allocation (LDA) will be used to sort Usenet messages from different newsgroups.

First, create the Document-Term matrix

# Keep only rows from the talk.* newsgroups whose word appears in more than
# 50 rows across those groups.
word_talk_newsgroups <- usenet_words %>%
  # restrict to the talk.* hierarchy; the escaped dot anchors the match to
  # the literal "talk." prefix
  filter(str_detect(newsgroup, "^talk\\.")) %>%
  # attach each word's total row count to every one of its rows
  group_by(word) %>%
  mutate(word_total = n()) %>%
  ungroup() %>%
  # strictly greater than 50, i.e. at least 51 occurrences
  filter(word_total > 50)

# N = 70,794

# Build a document-term matrix. Each document name joins the newsgroup and
# the `id` column with "_" (presumably one document per message).
talk_dtm <- word_talk_newsgroups %>%
  unite(col = document, newsgroup, id, sep = "_") %>%
  # one row per (document, word) pair with its count n
  count(document, word) %>%
  cast_dtm(document = document, term = word, value = n)

# Returns Large Document Matrix: 1896 x 662

Do the topic modeling!

library(topicmodels)

# Fit a four-topic LDA model with the default VEM estimator; the seed is
# fixed in the control list so the fit can be repeated.
talk_lda <- LDA(
  x = talk_dtm,
  k = 4,
  method = "VEM",
  control = list(seed = 2016)
)

Visualize the modeling to see if the same newsgroups were formed!

# Show the highest-beta terms of each fitted topic, one facet per topic.
tidy(talk_lda, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(order_by = beta, n = 8) %>%
  ungroup() %>%
  # order terms by beta separately inside every topic facet
  mutate(term = reorder_within(term, by = beta, within = topic)) %>%
  ggplot(aes(x = beta, y = term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

(skipped gamma distribution visualization)