4.2 Tokenizing by n-grams
n-gram: a consecutive sequence of n words (a bigram is a pair of adjacent words). Useful for seeing how frequently certain words appear together so that a model of their relationship can be built.
library(dplyr)
library(tidytext)
library(janeaustenr)
# use unnest_tokens() as before, but specify token = "ngrams" and n = 2 instead of tokenizing by single words
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
head(austen_bigrams)
## # A tibble: 6 × 2
## book bigram
## <fct> <chr>
## 1 Sense & Sensibility sense and
## 2 Sense & Sensibility and sensibility
## 3 Sense & Sensibility <NA>
## 4 Sense & Sensibility by jane
## 5 Sense & Sensibility jane austen
## 6 Sense & Sensibility <NA>
OK, let’s try this again with some real data:
# from Kaggle: https://www.kaggle.com/datatattle/covid-19-nlp-text-classification?select=Corona_NLP_train.csv
covid_tweets <- readr::read_csv("data/Corona_NLP_train.csv")
## Rows: 41157 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Location, TweetAt, OriginalTweet, Sentiment
## dbl (2): UserName, ScreenName
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
covid_bigrams <- covid_tweets %>%
  select(OriginalTweet, Sentiment) %>%
  unnest_tokens(bigram, OriginalTweet, token = "ngrams", n = 2)
head(covid_bigrams)
## # A tibble: 6 × 2
## Sentiment bigram
## <chr> <chr>
## 1 Neutral menyrbie phil_gahan
## 2 Neutral phil_gahan chrisitv
## 3 Neutral chrisitv https
## 4 Neutral https t.co
## 5 Neutral t.co ifz9fan2pa
## 6 Neutral ifz9fan2pa and
This output clearly needs to be filtered.
covid_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 474,761 × 2
## bigram n
## <chr> <int>
## 1 https t.co 23953
## 2 covid 19 11687
## 3 grocery store 4775
## 4 to the 3873
## 5 in the 3639
## 6 of the 3046
## 7 the coronavirus 2174
## 8 the grocery 2138
## 9 the supermarket 1890
## 10 coronavirus https 1825
## # … with 474,751 more rows
Filter stop-words. stop-words: uninteresting or very common words such as “of”, “the”, and “be”.
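For reference, the stop_words data frame that ships with tidytext has one row per word plus the lexicon it came from; a quick peek (this is just an aside, not part of the walkthrough):
# stop_words is bundled with tidytext: columns `word` and `lexicon`
head(stop_words)
count(stop_words, lexicon)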
In order to filter out stop words, we need to split the bigrams into separate columns using the separate() function from tidyr.
library(tidyr)
bigrams_separated <- covid_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# n = 1,275,993 to n = 393,315
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
# n = 216,367
Clearly a lot of people are posting links on Twitter; the “https” and “t.co” tokens come from Twitter’s shortened URLs.
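If we wanted, we could drop those URL fragments before counting. This is an extra cleaning step of our own, not part of the original walkthrough, and the token list to strip is an assumption:
# hypothetical extra step: remove bigrams whose halves are URL debris ("https", "t.co")
bigrams_no_urls <- bigrams_filtered %>%
  filter(!word1 %in% c("https", "t.co"),
         !word2 %in% c("https", "t.co"))
bigrams_no_urls %>%
  count(word1, word2, sort = TRUE)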
Now that we’ve filtered out the stop words, let’s unite the words again to recreate true bigrams (without stop words).
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
bigrams_united
## # A tibble: 393,315 × 2
## Sentiment bigram
## <chr> <chr>
## 1 Neutral menyrbie phil_gahan
## 2 Neutral phil_gahan chrisitv
## 3 Neutral chrisitv https
## 4 Neutral https t.co
## 5 Neutral t.co ifz9fan2pa
## 6 Neutral https t.co
## 7 Neutral t.co xx6ghgfzcc
## 8 Neutral https t.co
## 9 Neutral t.co i2nlzdxno8
## 10 Positive advice talk
## # … with 393,305 more rows
If we were interested in trigrams, we could repeat the sequence with n = 3:
covid_tweets %>%
  select(OriginalTweet, Sentiment) %>%
  unnest_tokens(trigram, OriginalTweet, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 187,821 × 4
## word1 word2 word3 n
## <chr> <chr> <chr> <int>
## 1 coronavirus https t.co 1822
## 2 covid 19 pandemic 917
## 3 covid19 https t.co 551
## 4 19 https t.co 512
## 5 covid 19 https 508
## 6 grocery store workers 432
## 7 covid 19 outbreak 386
## 8 covid 19 crisis 385
## 9 covid_19 https t.co 365
## 10 pandemic https t.co 363
## # … with 187,811 more rows
4.2.1 Analyzing bigrams
This dataset does not really give us a grouping variable like the Austen data, but it does include sentiment. Let’s group by the sentiment the curators assigned to each tweet and see which words are most often paired with “shopping”.
bigrams_filtered %>%
  filter(word2 == "shopping") %>%
  count(Sentiment, word1, sort = TRUE)
## # A tibble: 571 × 3
## Sentiment word1 n
## <chr> <chr> <int>
## 1 Positive online 398
## 2 Neutral online 305
## 3 Negative online 298
## 4 Extremely Positive online 249
## 5 Positive grocery 121
## 6 Extremely Negative online 102
## 7 Neutral grocery 79
## 8 Negative grocery 56
## 9 Extremely Positive grocery 44
## 10 Negative panic 39
## # … with 561 more rows
Bigrams can be treated like terms in a document. Here each sentiment group acts as a “document”: we can compute tf-idf for bigrams and visualize the results by sentiment.
bigram_tf_idf <- bigrams_united %>%
  count(Sentiment, bigram) %>%
  bind_tf_idf(bigram, Sentiment, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf
## # A tibble: 252,318 × 6
## Sentiment bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Extremely Negative price war 57 0.00106 0.511 0.000539
## 2 Extremely Negative stop panic 110 0.00204 0.223 0.000455
## 3 Extremely Positive strong amp 17 0.000255 1.61 0.000411
## 4 Extremely Negative terroristic threats 13 0.000241 1.61 0.000387
## 5 Extremely Positive experiencing hardships 14 0.000210 1.61 0.000338
## 6 Extremely Negative walmart trader 19 0.000352 0.916 0.000322
## 7 Extremely Negative food shortages 34 0.000630 0.511 0.000322
## 8 Extremely Negative break eggs 10 0.000185 1.61 0.000298
## 9 Extremely Negative milk break 10 0.000185 1.61 0.000298
## 10 Extremely Positive friends safe 12 0.000180 1.61 0.000290
## # … with 252,308 more rows
Visualizing tf-idf
library(forcats)
library(ggplot2)
bigram_tf_idf %>%
  group_by(Sentiment) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(bigram, tf_idf), fill = Sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Sentiment, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
Takeaway: bigrams are informative and can make tokens more interpretable, but they also make counts sparser (any specific two-word pair is rarer than either word on its own). They tend to be most useful in very large datasets.
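A quick way to see that sparsity, using the objects created above, is to compare the number of distinct single-word tokens to the number of distinct bigrams (exact numbers will depend on your copy of the data):
# distinct unigrams vs. distinct bigrams -- the bigram vocabulary is far larger and sparser
covid_tweets %>%
  unnest_tokens(word, OriginalTweet) %>%
  summarise(n_distinct_words = n_distinct(word))
covid_bigrams %>%
  summarise(n_distinct_bigrams = n_distinct(bigram))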
4.2.2 Using bigrams to provide context in sentiment analysis
This dataset already contains a sentiment label for each overall tweet, but as we saw in the tf-idf visual, those labels don’t make much sense in the context of a single bigram. So, let’s redo the sentiment analysis ourselves. Bigrams let us account for context, such as the use of “not” before “happy”.
bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 1,135 × 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not to 220
## 2 not a 181
## 3 not be 177
## 4 not the 149
## 5 not only 111
## 6 not going 86
## 7 not just 86
## 8 not have 73
## 9 not panic 70
## 10 not sure 69
## # … with 1,125 more rows
AFINN will be used to assign a numeric sentiment value to each word that appears after “not”.
Note: You need to run get_sentiments() interactively (to approve the download) per licensing requirements, so we can’t show those results in this online version.
AFINN <- get_sentiments("afinn")
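As a reminder of the lexicon’s shape: AFINN is a two-column tibble with one row per word and an integer value from -5 (very negative) to +5 (very positive). The rows below are only an illustration of that structure; the real scores come from the downloaded lexicon.
# illustrative rows only -- not the authoritative AFINN scores
tibble::tibble(
  word  = c("panic", "happy", "crisis"),
  value = c(-3, 3, -3)
)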
# get the most frequent words preceded by "not"
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)
# n = 194
not_words
The most common sentiment-associated word following “not” is “panic”. “Panic” on its own scores as quite negative, but “not panic” leans positive.
Next we compute how influential certain words were in pushing the perceived sentiment in the wrong direction. Following the book, this is done by multiplying each word’s frequency by its sentiment value.
not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Sentiment value * number of occurrences",
       y = "Words preceded by \"not\"")
Panic looks very influential.
Let’s try again with more negation terms:
negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)
# n=342
negated_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word1, ncol = 2, scales = "free") +
  labs(x = "Sentiment value * number of occurrences",
       y = "Words preceded by negation term")
4.2.3 Visualizing a network of bigrams with ggraph
Relationships between words can be visualized using a node graph. nodes: the subject (where an edge comes from) and the object (where an edge points to). edge: an association between two nodes, which carries a weight.
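As a tiny illustration of those terms (made-up data, not from the tweets), each row of an edge list is one edge from word1 to word2, and n is its weight:
# three weighted edges produce a graph with six nodes
toy_edges <- tibble::tibble(
  word1 = c("grocery", "panic", "online"),
  word2 = c("store", "buying", "shopping"),
  n = c(3, 2, 5)
)
igraph::graph_from_data_frame(toy_edges)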
library(igraph)
# original counts
bigram_counts
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
bigram_graph
Now that the igraph object has been created, we can plot it with ggraph!
library(ggraph)  # needed for ggraph() below
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
Here is a polished version of the same network, with arrows for direction, edge transparency tied to frequency, and a cleaner theme:
set.seed(2020)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
This is a visualization of a Markov chain.
Markov chain: a common model in text analysis. It is a stochastic model describing a sequence of possible events in which the probability of each event depends only on the state of the previous event. In this case, words are assigned probabilities and the likelihood of the next word depends on the previous word. For example, in a word generator, if the current word is “restaurant”, there is a good chance the following word will be “reservation”.
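A minimal sketch of that idea, reusing the bigram_counts table built earlier: to “generate” the next word, sample among the words that followed the current word, weighted by how often each bigram occurred. The helper below is hypothetical, not from the book.
# hypothetical helper: sample a next word based on observed bigram frequencies
next_word <- function(current, counts = bigram_counts) {
  candidates <- dplyr::filter(counts, word1 == current)
  if (nrow(candidates) == 0) return(NA_character_)
  sample(candidates$word2, size = 1, prob = candidates$n)
}
set.seed(123)
next_word("grocery")  # most likely "store", since "grocery store" is a very common bigram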