4.2 Tokenizing by n-grams

n-gram: a sequence of n adjacent words (a bigram when n = 2). N-grams are useful for identifying how often certain words appear together so that a model of the relationship between them can be built. For example, the sentence "I love grocery shopping" yields the bigrams "i love", "love grocery", and "grocery shopping".

library(dplyr)
library(tidytext)
library(janeaustenr)

# use unnest_tokens() as before, but specify token = "ngrams" and n = 2 instead of tokenizing by single words
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

head(austen_bigrams)
## # A tibble: 6 × 2
##   book                bigram         
##   <fct>               <chr>          
## 1 Sense & Sensibility sense and      
## 2 Sense & Sensibility and sensibility
## 3 Sense & Sensibility <NA>           
## 4 Sense & Sensibility by jane        
## 5 Sense & Sensibility jane austen    
## 6 Sense & Sensibility <NA>
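
The <NA> rows appear where a line of text is too short to form a bigram (for example, the blank lines in the Austen text); they can be dropped with filter(!is.na(bigram)).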

OK, let's try it again with some real data:

# from Kaggle: https://www.kaggle.com/datatattle/covid-19-nlp-text-classification?select=Corona_NLP_train.csv
covid_tweets <- readr::read_csv("data/Corona_NLP_train.csv")
## Rows: 41157 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Location, TweetAt, OriginalTweet, Sentiment
## dbl (2): UserName, ScreenName
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
covid_bigrams <- covid_tweets %>%
  select(OriginalTweet, Sentiment) %>% 
  unnest_tokens(bigram, OriginalTweet, token = "ngrams", n = 2)

head(covid_bigrams)
## # A tibble: 6 × 2
##   Sentiment bigram             
##   <chr>     <chr>              
## 1 Neutral   menyrbie phil_gahan
## 2 Neutral   phil_gahan chrisitv
## 3 Neutral   chrisitv https     
## 4 Neutral   https t.co         
## 5 Neutral   t.co ifz9fan2pa    
## 6 Neutral   ifz9fan2pa and

This output clearly needs to be filtered.

covid_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 474,761 × 2
##    bigram                n
##    <chr>             <int>
##  1 https t.co        23953
##  2 covid 19          11687
##  3 grocery store      4775
##  4 to the             3873
##  5 in the             3639
##  6 of the             3046
##  7 the coronavirus    2174
##  8 the grocery        2138
##  9 the supermarket    1890
## 10 coronavirus https  1825
## # … with 474,751 more rows

Let's filter out the stop words. Stop words: uninteresting or very common words such as "of", "the", and "be".

In order to filter out stop words, we first need to split each bigram into two columns using the separate() function from tidyr.

library(tidyr)

bigrams_separated <- covid_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# n = 1,275,993 to n = 393,315
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)
# n = 216,367

Clearly a lot of people are posting links on Twitter; the t.co tokens come from Twitter's shortened URLs.
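
Since those URL fragments are noise rather than language, one option (an extra cleaning step not in the original analysis, with a hand-picked list of fragments) is to drop them before counting:

# URL fragments left behind by Twitter's link shortener (assumed list)
url_fragments <- c("https", "http", "t.co")

# recount bigrams with the URL fragments removed from either position
bigram_counts_no_urls <- bigrams_filtered %>%
  filter(!word1 %in% url_fragments,
         !word2 %in% url_fragments) %>%
  count(word1, word2, sort = TRUE)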

Now that we've filtered out the stop words, let's unite the words back into bigrams, this time without stop words.

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united
## # A tibble: 393,315 × 2
##    Sentiment bigram             
##    <chr>     <chr>              
##  1 Neutral   menyrbie phil_gahan
##  2 Neutral   phil_gahan chrisitv
##  3 Neutral   chrisitv https     
##  4 Neutral   https t.co         
##  5 Neutral   t.co ifz9fan2pa    
##  6 Neutral   https t.co         
##  7 Neutral   t.co xx6ghgfzcc    
##  8 Neutral   https t.co         
##  9 Neutral   t.co i2nlzdxno8    
## 10 Positive  advice talk        
## # … with 393,305 more rows

If we were interested in trigrams, we could repeat the sequence with n = 3:

covid_tweets %>%
  select(OriginalTweet, Sentiment) %>% 
  unnest_tokens(trigram, OriginalTweet, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 187,821 × 4
##    word1       word2 word3        n
##    <chr>       <chr> <chr>    <int>
##  1 coronavirus https t.co      1822
##  2 covid       19    pandemic   917
##  3 covid19     https t.co       551
##  4 19          https t.co       512
##  5 covid       19    https      508
##  6 grocery     store workers    432
##  7 covid       19    outbreak   386
##  8 covid       19    crisis     385
##  9 covid_19    https t.co       365
## 10 pandemic    https t.co       363
## # … with 187,811 more rows

4.2.1 Analyzing bigrams

This dataset does not really give us a grouping variable like the Austen data, but it does include sentiment. Let's group by the sentiment the curators assigned to each tweet and find which words are most often paired with "shopping".

bigrams_filtered %>%
  filter(word2 == "shopping") %>%
  count(Sentiment, word1, sort = TRUE)
## # A tibble: 571 × 3
##    Sentiment          word1       n
##    <chr>              <chr>   <int>
##  1 Positive           online    398
##  2 Neutral            online    305
##  3 Negative           online    298
##  4 Extremely Positive online    249
##  5 Positive           grocery   121
##  6 Extremely Negative online    102
##  7 Neutral            grocery    79
##  8 Negative           grocery    56
##  9 Extremely Positive grocery    44
## 10 Negative           panic      39
## # … with 561 more rows

Bigrams can be treated like terms in a document. We can look at the tf-idf of bigrams, treating each sentiment label as a document, and visualize the results by sentiment.

bigram_tf_idf <- bigrams_united %>%
  count(Sentiment, bigram) %>%
  bind_tf_idf(bigram, Sentiment, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
## # A tibble: 252,318 × 6
##    Sentiment          bigram                     n       tf   idf   tf_idf
##    <chr>              <chr>                  <int>    <dbl> <dbl>    <dbl>
##  1 Extremely Negative price war                 57 0.00106  0.511 0.000539
##  2 Extremely Negative stop panic               110 0.00204  0.223 0.000455
##  3 Extremely Positive strong amp                17 0.000255 1.61  0.000411
##  4 Extremely Negative terroristic threats       13 0.000241 1.61  0.000387
##  5 Extremely Positive experiencing hardships    14 0.000210 1.61  0.000338
##  6 Extremely Negative walmart trader            19 0.000352 0.916 0.000322
##  7 Extremely Negative food shortages            34 0.000630 0.511 0.000322
##  8 Extremely Negative break eggs                10 0.000185 1.61  0.000298
##  9 Extremely Negative milk break                10 0.000185 1.61  0.000298
## 10 Extremely Positive friends safe              12 0.000180 1.61  0.000290
## # … with 252,308 more rows
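
For reference, bind_tf_idf() computes idf as the natural log of the number of documents divided by the number of documents containing the term. With the five sentiment labels acting as documents, 1.61 ≈ ln(5/1), 0.916 ≈ ln(5/2), 0.511 ≈ ln(5/3), and 0.223 ≈ ln(5/4).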

Visualizing tf-idf

library(forcats)
library(ggplot2)

bigram_tf_idf %>%
  group_by(Sentiment) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(bigram, tf_idf), fill = Sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Sentiment, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

Takeaway: bigrams are informative and can make tokens more interpretable, but they also make the counts sparser (any particular two-word pair is rarer than either word on its own). They tend to be most useful in very large datasets.
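
As a rough illustration of that sparsity (a quick sketch, not part of the original notes), we can compare how many distinct unigrams versus bigrams the tweets produce and what share of each occurs only once:

# tokenize the same tweets by single words for comparison
covid_words <- covid_tweets %>%
  select(OriginalTweet) %>%
  unnest_tokens(word, OriginalTweet)

# share of tokens that occur exactly once; expect this to be much
# higher for bigrams than for single words
covid_words %>%
  count(word) %>%
  summarise(distinct_tokens = n(), share_singletons = mean(n == 1))

covid_bigrams %>%
  count(bigram) %>%
  summarise(distinct_tokens = n(), share_singletons = mean(n == 1))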

4.2.2 Using bigrams to provide context in sentiment analysis

This dataset already contains the sentiment of the overall tweet, but as we saw in the tf-idf visual, those labels don't make much sense in the context of a single bigram. So, let's redo the sentiment analysis at the word level. Context can make a difference here, such as the use of "not" before "happy".

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 1,135 × 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   to      220
##  2 not   a       181
##  3 not   be      177
##  4 not   the     149
##  5 not   only    111
##  6 not   going    86
##  7 not   just     86
##  8 not   have     73
##  9 not   panic    70
## 10 not   sure     69
## # … with 1,125 more rows

The AFINN lexicon will be used to assign a numeric sentiment value to each word that follows "not".

Note: You need to run get_sentiments interactively (to approve the download) per licensing requirements, so we can’t show those results in this online version.

AFINN <- get_sentiments("afinn")

# get the most frequent sentiment-associated words preceded by "not"
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

# n = 194
not_words

The most common sentiment-associated word following "not" is "panic". Panic is pretty negative, but "not panic" reads as more positive.

Next, we compute how influential certain words were in pushing the sentiment in the wrong direction. As in the book, this is done by multiplying each word's frequency by its sentiment value.

not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Sentiment value * number of occurrences",
       y = "Words preceded by \"not\"")

Panic looks very influential.

Let's try again with more negation terms:

negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)
# n=342

negated_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word1, ncol = 2, scales = "free") +
  labs(x = "Sentiment value * number of occurrences",
       y = "Words preceded by \"not\"")

4.2.3 Visualizing network of bigrams with ggraph

Relationships between words can be visualized using a network graph. Nodes are the words; each edge runs from a "from" node to a "to" node and carries a weight (here the bigram count) representing the strength of the association.

library(igraph)

# original counts
bigram_counts

# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()

bigram_graph

Now that the igraph object has been created, we must plot it with ggraph!

library(ggraph)

set.seed(2017)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

Here is a more polished version, with directional arrows and a cleaner theme:

set.seed(2020)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

This is a visualization of a Markov chain.

Markov chain: a common model in text analysis. It is a stochastic model describing a sequence of possible events in which the probability of each event depends only on the state attained in the previous event. In this case, words are assigned probabilities, and the likelihood of the next word depends only on the prior word. For example, in a word generator, if the current word is "restaurant", there is a good chance the following word will be "reservation".
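
As a toy illustration of the idea (a sketch built on our bigram_counts, not something from the book), we can sample a "next word" with probability proportional to the observed bigram counts:

# given a current word, sample the next word in proportion to bigram frequency
next_word <- function(current, counts = bigram_counts) {
  candidates <- dplyr::filter(counts, word1 == current)
  if (nrow(candidates) == 0) return(NA_character_)
  sample(candidates$word2, size = 1, prob = candidates$n)
}

set.seed(123)
next_word("grocery")  # most likely "store", the most frequent continuation in these tweets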