3.5 Stop word removal in R

library(hcandersenr)
library(tidyverse)
library(tidytext)

# Keep only the English text of "The fir tree" from hca_fairytales().
fir_tree <- filter(
  hca_fairytales(),
  book == "The fir tree",
  language == "English"
)

fir_tree
## # A tibble: 253 × 3
##    text                                                           book  language
##    <chr>                                                          <chr> <chr>   
##  1 "Far down in the forest, where the warm sun and the fresh air… The … English 
##  2 "resting-place, grew a pretty little fir-tree; and yet it was… The … English 
##  3 "wished so much to be tall like its companions– the pines and… The … English 
##  4 "around it. The sun shone, and the soft air fluttered its lea… The … English 
##  5 "little peasant children passed by, prattling merrily, but th… The … English 
##  6 "them not. Sometimes the children would bring a large basket … The … English 
##  7 "strawberries, wreathed on a straw, and seat themselves near … The … English 
##  8 "say, \"Is it not a pretty little tree?\" which made it feel … The … English 
##  9 "before."                                                      The … English 
## 10 "And yet all this while the tree grew a notch or joint taller… The … English 
## # … with 243 more rows
# Split the `text` column into tokens, one per row, in a new `word` column.
tidy_fir_tree <- unnest_tokens(fir_tree, output = word, input = text)

tidy_fir_tree
## # A tibble: 3,288 × 3
##    book         language word  
##    <chr>        <chr>    <chr> 
##  1 The fir tree English  far   
##  2 The fir tree English  down  
##  3 The fir tree English  in    
##  4 The fir tree English  the   
##  5 The fir tree English  forest
##  6 The fir tree English  where 
##  7 The fir tree English  the   
##  8 The fir tree English  warm  
##  9 The fir tree English  sun   
## 10 The fir tree English  and   
## # … with 3,278 more rows
  • The stopwords() function returns a character vector, so we can filter out stop words with %in%.
# stopwords() returns a plain character vector, so rows are dropped by testing
# membership with %in%. The call is namespace-qualified because none of the
# packages attached in this section (hcandersenr, tidyverse, tidytext) exports
# stopwords(); it comes from the stopwords package.
tidy_fir_tree %>%
  filter(!(word %in% stopwords::stopwords(source = "snowball")))
## # A tibble: 1,547 × 3
##    book         language word   
##    <chr>        <chr>    <chr>  
##  1 The fir tree English  far    
##  2 The fir tree English  forest 
##  3 The fir tree English  warm   
##  4 The fir tree English  sun    
##  5 The fir tree English  fresh  
##  6 The fir tree English  air    
##  7 The fir tree English  made   
##  8 The fir tree English  sweet  
##  9 The fir tree English  resting
## 10 The fir tree English  place  
## # … with 1,537 more rows
  • If we use the get_stopwords() function from tidytext instead, it returns a data frame rather than a vector, so we can remove the stop words with the anti_join() function.
# get_stopwords() returns a data frame with a `word` column, so the stop words
# can be removed with an anti-join. The key is named explicitly so the join
# does not depend on (or announce) whichever columns happen to share a name.
tidy_fir_tree %>%
  anti_join(get_stopwords(source = "snowball"), by = "word")
## # A tibble: 1,547 × 3
##    book         language word   
##    <chr>        <chr>    <chr>  
##  1 The fir tree English  far    
##  2 The fir tree English  forest 
##  3 The fir tree English  warm   
##  4 The fir tree English  sun    
##  5 The fir tree English  fresh  
##  6 The fir tree English  air    
##  7 The fir tree English  made   
##  8 The fir tree English  sweet  
##  9 The fir tree English  resting
## 10 The fir tree English  place  
## # … with 1,537 more rows
  • It is perfectly acceptable to start with a premade word list and remove or append additional words according to your particular use case.

  • Adding words to a built-in stop word list:

# Tokenize one example sentence, dropping punctuation. The text is kept on a
# single line so no stray line break or indentation ends up inside the string,
# and tokens() is namespace-qualified because quanteda is not attached by the
# library() calls in this section.
toks <- quanteda::tokens(
  "The judge will sentence Mr. Adams to nine years in prison",
  remove_punct = TRUE
)
toks
## Tokens consisting of 1 document.
## text1 :
##  [1] "The"      "judge"    "will"     "sentence" "Mr"       "Adams"   
##  [7] "to"       "nine"     "years"    "in"       "prison"
  • Adding the custom stop words “will”, “mr”, and “nine”:
# Extend the built-in English list with three custom words. Matching ignores
# case here: the printed result no longer contains "The" or "Mr". Both
# functions are namespace-qualified because quanteda is not attached by the
# library() calls in this section.
quanteda::tokens_remove(
  toks,
  c(quanteda::stopwords("english"), "will", "mr", "nine")
)
## Tokens consisting of 1 document.
## text1 :
## [1] "judge"    "sentence" "Adams"    "years"    "prison"