3.7 Stopwords for Pidgin

  • Pidgin is an English-based creole language spoken as a lingua franca, i.e. as a common language between speakers of different native languages

  • Nigerian Pidgin (Naija) is an English-based creole language spoken across Nigeria

  • For example, the Pidgin English sentences “Dis food sweet well, well” and “Dis food sweet no be smal” both translate to “This meal is delicious”

# Reference dictionary: a one-column tibble (`word`) holding every
# word-boundary token extracted from the entries stored in
# data/wiktionary_words.rds, sorted by `word`.
# NOTE(review): tokens keep their original case here, while unnest_tokens()
# lowercases by default -- confirm that case-sensitive matching against the
# corpus tokens is intended.
dic <- read_rds("data/wiktionary_words.rds") %>%
  enframe(value = "words") %>%
  # One list element of tokens per dictionary entry.
  mutate(word = str_extract_all(words, boundary("word"))) %>%
  select(word) %>%
  # Flatten the list-column to one token per row.
  unnest(word) %>%
  arrange(word)
# Download the Pidgin monolingual corpus from the PidginUNMT GitHub repository.
# The file is parsed as header-less tab-separated text and its first column is
# renamed to `text`.
# NOTE(review): read_tsv() applies delimiter and quote parsing; presumably each
# corpus line is plain text that lands in a single column -- confirm.
Piding <- rename(
  read_tsv(
    "https://raw.githubusercontent.com/keleog/PidginUNMT/master/corpus/monolingual/pidgin_corpus.txt",
    col_names = FALSE
  ),
  text = 1
)

# Ten most frequent tokens in the Pidgin corpus.
# unnest_tokens() splits `text` into one token per row, and count(sort = TRUE)
# orders the tokens from most to least frequent, so head(10) returns the top
# ten. Only the `word` column is kept, so no frequency or rank columns are
# derived along the way.
Piding %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  # Keep tokens that occur more than twice in the corpus.
  filter(n > 2) %>%
  select(word) %>%
  head(10)
## # A tibble: 10 × 1
##    word 
##    <chr>
##  1 for  
##  2 dey  
##  3 say  
##  4 wey  
##  5 di   
##  6 the  
##  7 and  
##  8 dem  
##  9 to   
## 10 of
# Ten most frequent corpus tokens that do not appear in the dictionary `dic`.
# count(sort = TRUE) orders tokens from most to least frequent and anti_join()
# preserves that order, so head(10) returns the top ten remaining tokens.
# Only the `word` column is kept, so no frequency or rank columns are derived
# along the way.
Piding %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  # Keep tokens that occur more than twice in the corpus.
  filter(n > 2) %>%
  select(word) %>%
  # Drop every token found in `dic`; the join key is the shared `word` column,
  # which dplyr reports in its "Joining, by" message.
  anti_join(dic) %>%
  head(10)
## Joining, by = "word"
## # A tibble: 10 × 1
##    word   
##    <chr>  
##  1 pipo   
##  2 nigeria
##  3 tok    
##  4 wetin  
##  5 buhari 
##  6 goment 
##  7 comot  
##  8 pikin  
##  9 pesin  
## 10 apc