9.2 Pre-processing

library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(ggplot2)
training_folder <- "data/20news-bydate/20news-bydate-train/"

# read all the files in a folder into a data frame, one row per line of text
read_folder <- function(infolder) {
  tibble(file = dir(infolder, full.names = TRUE)) %>%
    mutate(text = map(file, read_lines)) %>%
    transmute(id = basename(file), text) %>%
    unnest(text)
}
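As a quick sanity check (not part of the original walkthrough), read_folder can be run on a single subfolder; the sci.space path below assumes the standard 20news-bydate layout.

# sketch: read one subfolder and inspect the two-column (id, text) result
read_folder(file.path(training_folder, "sci.space")) %>%
  head()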

# apply read_folder to each subfolder in the training directory
raw_text <- tibble(folder = dir(training_folder, full.names = TRUE)) %>%
  mutate(folder_out = map(folder, read_folder)) %>%
  # flatten the folder_out list-column of data frames into a regular data frame
  unnest(cols = c(folder_out)) %>%
  # keep three variables: newsgroup (the base of the subfolder path), id, and text
  transmute(newsgroup = basename(folder), id, text)
head(raw_text)

Newsgroup names are hierarchical, of the form main.sub.subsub (for example, sci.space or talk.politics.misc). This dataset contains 20 unique newsgroups.
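A quick tally (an added check, not in the original notes) confirms the count; it should return 20 rows, one per newsgroup.

# count the lines of text contributed by each newsgroup
raw_text %>%
  count(newsgroup, sort = TRUE)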

9.2.1 Pre-processing text

The text field contains a lot of noise, such as "From:" header lines and email addresses, that is unlikely to be useful for the analyses, so it is removed with some regular expressions. Message headers and email signatures are also filtered out.

library(stringr)

# keep only the message body: lines after the first empty line
# (which drops the header block) and before the first line
# starting with "--" (which drops the email signature)
cleaned_text <- raw_text %>%
  group_by(newsgroup, id) %>%
  filter(cumsum(text == "") > 0,
         cumsum(str_detect(text, "^--")) == 0) %>%
  ungroup()

# N = 364,364
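A minimal check, assuming cleaned_text is still in memory: print the first lines of an arbitrary message to confirm the header block and the signature are gone.

# peek at one message; header lines before the first blank line and
# anything after a line starting with "--" should no longer appear
cleaned_text %>%
  filter(id == first(id)) %>%
  pull(text) %>%
  head(10)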

More cleaning:

cleaned_text2 <- cleaned_text %>%
  # drop quoted text from other users (lines starting with ">")
  filter(str_detect(text, "^[^>]+[A-Za-z\\d]") | text == "",
         # drop attribution lines ending in "writes:" or "writes..."
         !str_detect(text, "writes(:|\\.\\.\\.)$"),
         # drop lines beginning with "In article <"
         !str_detect(text, "^In article <"),
         # drop two specifically noisy messages
         !id %in% c(9704, 9985))
# N = 269,838

head(cleaned_text2)
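To see where this second pass removed the most, a short sketch (added here, not from the original) joins the before and after line counts per newsgroup.

# how many lines the second cleaning pass removed from each newsgroup
cleaned_text %>%
  count(newsgroup, name = "before") %>%
  left_join(count(cleaned_text2, newsgroup, name = "after"),
            by = "newsgroup") %>%
  mutate(removed = before - after)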

Now tokenize the text into individual words:

library(tidytext)

usenet_words <- cleaned_text2 %>%
  unnest_tokens(word, text) %>%
  # keep only tokens ending in a letter or apostrophe (this drops numbers)
  filter(str_detect(word, "[a-z']$"),
         # and remove stop words
         !word %in% stop_words$word)

# N = 710,438
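As a quick look at the output (an added step), list the most frequent remaining tokens.

# the most common words after removing stop words and number-like tokens
usenet_words %>%
  count(word, sort = TRUE) %>%
  head(10)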