9.2 9.1 Pre-processing
library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(ggplot2)

# Root of the training split; the code below reads every entry of this
# directory as one newsgroup subfolder.
training_folder <- "data/20news-bydate/20news-bydate-train/"
# Read every file found directly inside `infolder` into one long tibble.
#
# infolder: path of the directory to read.
# Returns a tibble with one row per line of text and two columns:
#   id   - base name of the file the line came from
#   text - the content of the line
read_folder <- function(infolder) {
  paths <- dir(infolder, full.names = TRUE)
  # one character vector of lines per file, kept as a list column
  lines_by_file <- map(paths, read_lines)
  tibble(id = basename(paths), text = lines_by_file) %>%
    # expand the list column so that each line gets its own row
    unnest(text)
}
# Apply read_folder() to each subfolder of the training directory and stack
# the results into one tibble with one row per line of text.
raw_text <- tibble(folder = dir(training_folder, full.names = TRUE)) %>%
  mutate(folder_out = map(folder, read_folder)) %>%
  # flatten the folder_out list column of data frames into regular columns
  unnest(cols = c(folder_out)) %>%
  # keep three variables: newsgroup (base name of the subfolder path), id, text
  transmute(newsgroup = basename(folder), id, text)

head(raw_text)

Newsgroups have a hierarchy like this: main.sub.subsub. In this dataset there are 20 unique newsgroups.
9.2.1 Pre-processing text
There are a lot of noisy characters in the text field, like ‘From:’ or email addresses that will probably not be useful for the analyses so they need to be removed using some RegEx. Empty lines and email signatures will also be filtered out.
library(stringr)
# Trim each message (one group per newsgroup/id) down to its body.
# A line is kept only when
#   * an empty line has already occurred in the message (the first empty
#     line itself is kept as well), which drops the leading lines, and
#   * no line starting with "--" has occurred yet, that line included,
#     which drops email signatures.
# Both running counts are taken over every line of the message, so the two
# tests have to stay inside a single filter() call.
cleaned_text <- raw_text %>%
  group_by(newsgroup, id) %>%
  filter(
    cumsum(text == "") > 0 &
      cumsum(str_detect(text, "^--")) == 0
  ) %>%
  ungroup()
# N = 364,364

More cleaning:
# Second cleaning pass: each step below drops one kind of noisy line.
cleaned_text2 <- cleaned_text %>%
  # quotes from other users: keep empty lines, and lines that open with a
  # run of non-">" characters followed by a letter or digit
  filter(str_detect(text, "^[^>]+[A-Za-z\\d]") | text == "") %>%
  # attribution lines ending in "writes:" or "writes..."
  filter(!str_detect(text, "writes(:|\\.\\.\\.)$")) %>%
  # lines beginning with "In article <"
  filter(!str_detect(text, "^In article <")) %>%
  # two specifically noisy records, dropped by id
  filter(!id %in% c(9704, 9985))
# N = 269,838
head(cleaned_text2)

TOKENIZE!
library(tidytext)
# Split the cleaned text into one row per word, then discard tokens that are
# not useful for the analysis.
usenet_words <- cleaned_text2 %>%
  unnest_tokens(word, text) %>%
  # remove numbers: keep only tokens that end in a lowercase letter or an
  # apostrophe
  filter(str_detect(word, "[a-z']$")) %>%
  # drop every word listed in the stop_words lexicon table
  anti_join(stop_words, by = "word")
# N = 710,438