9.3 9.2 Words in newsgroups
Compute tf-idf within newsgroups to see which topics are the most frequent and influential.
First step: get the frequency of words by newsgroup:
# Count how often each word appears in each newsgroup, most frequent first.
words_by_newsgroup <- usenet_words %>%
  count(newsgroup, word, sort = TRUE) %>%
  ungroup()
# Add tf, idf and tf-idf columns (term = word, document = newsgroup,
# count = n), then order rows from highest to lowest tf-idf.
tf_idf <- words_by_newsgroup %>%
  bind_tf_idf(word, newsgroup, n) %>%
  arrange(desc(tf_idf))

head(tf_idf)
VISUALIZE EACH BOARD IN A TOPIC
# Plot the 12 highest tf-idf words for each talk.* newsgroup, one facet per
# newsgroup.
tf_idf %>%
  filter(str_detect(newsgroup, "^talk\\.")) %>%
  group_by(newsgroup) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  # order the word factor by tf-idf so the bars are drawn in sorted order
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = newsgroup)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ newsgroup, scales = "free") +
  labs(x = "tf-idf", y = NULL)
Calculating pairwise correlations between newsgroups, based on the word counts within each newsgroup
library(widyr)

# Pairwise correlation between newsgroups:
#   item    = newsgroup (the items to compare)
#   feature = word      (what links one item to the others)
#   value   = n         (the counts the correlation is computed on)
newsgroup_cors <- words_by_newsgroup %>%
  pairwise_cor(newsgroup, word, n, sort = TRUE)

head(newsgroup_cors)
Topic Modeling!
Latent Dirichlet Allocation (LDA) will be used to sort Usenet messages from different newsgroups.
First, create the Document-Term matrix
# Keep only the newsgroups whose name starts with "talk", then keep only
# words that occur more than 50 times within that subset.
word_talk_newsgroups <- usenet_words %>%
  filter(str_detect(newsgroup, "^talk")) %>%
  group_by(word) %>%
  # word_total = number of rows (occurrences) for this word in the subset
  mutate(word_total = n()) %>%
  ungroup() %>%
  filter(word_total > 50)
# N = 70,794
# Convert into a document-term matrix. Each document name is built by
# uniting the newsgroup and the message id into a single `document` column.
talk_dtm <- word_talk_newsgroups %>%
  unite(document, newsgroup, id) %>%
  count(document, word) %>%
  cast_dtm(document, word, n)
# Returns Large Document Matrix: 1896 x 662
Do the topic modeling!
library(topicmodels)

# Fit a 4-topic LDA model on the talk.* document-term matrix; the seed is
# fixed so the fit is reproducible.
talk_lda <- LDA(talk_dtm, k = 4, control = list(seed = 2016))
Visualize the modeling to see if the same newsgroups were formed!
# Plot the 8 highest-beta terms of each fitted topic, one facet per topic.
talk_lda %>%
  tidy() %>%
  group_by(topic) %>%
  slice_max(beta, n = 8) %>%
  ungroup() %>%
  # reorder each term by its beta coefficient within each topic
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
(skipped gamma distribution visualization)