9.2 Pre-processing
library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(ggplot2)
# Path to the 20 Newsgroups training set (one subfolder per newsgroup).
training_folder <- "data/20news-bydate/20news-bydate-train/"
# Read every file in `infolder` into a tibble with one row per line of text.
# Returns columns: id (the file's base name) and text (one line per row).
read_folder <- function(infolder) {
  tibble(file = dir(infolder, full.names = TRUE)) %>%
    # read each file as a character vector of lines (list-column)
    mutate(text = map(file, read_lines)) %>%
    # keep only the message id (file name without path) and the text
    transmute(id = basename(file), text) %>%
    # expand the list-column so each line of text gets its own row
    unnest(text)
}
# Apply the read_folder function over each subfolder in the training dir.
raw_text <- tibble(folder = dir(training_folder, full.names = TRUE)) %>%
  mutate(folder_out = map(folder, read_folder)) %>%
  # flatten folder_out list col of dataframes into regular dataframe
  unnest(cols = c(folder_out)) %>%
  # create three new vars: newsgroup from the base of the subfolder path, id and text
  transmute(newsgroup = basename(folder), id, text)
head(raw_text)
Newsgroups are named hierarchically, like main.sub.subsub. In this dataset there are 20 unique newsgroups.
9.2.1 Pre-processing text
There are a lot of noisy characters in the text field, like ‘From:’ or email addresses that will probably not be useful for the analyses so they need to be removed using some RegEx. Empty lines and email signatures will also be filtered out.
library(stringr)
# Remove headers and email signatures. Kept lines:
# - must occur after the first occurrence of an empty line
#   (i.e. after the header block),
# - and before the first occurrence of a line starting with --
#   (the conventional signature delimiter).
cleaned_text <- raw_text %>%
  group_by(newsgroup, id) %>%
  filter(cumsum(text == "") > 0,
         cumsum(str_detect(text, "^--")) == 0) %>%
  ungroup()
# N = 364,364
More cleaning:
# Further cleaning: drop quoted text from other users and two bad records.
cleaned_text2 <- cleaned_text %>%
  # keep lines that do not start with ">" quoting (but keep blank lines)
  filter(str_detect(text, "^[^>]+[A-Za-z\\d]") | text == "",
         # drop attribution lines ending in "writes:" or "writes..."
         !str_detect(text, "writes(:|\\.\\.\\.)$"),
         # drop anything beginning with "In article <"
         !str_detect(text, "^In article <"),
         # two specifically noisy records
         !id %in% c(9704, 9985))
# N = 269,838
head(cleaned_text2)
Finally, tokenize the cleaned text into individual words:
library(tidytext)
# Split each line into one-word-per-row tokens, then filter noise.
usenet_words <- cleaned_text2 %>%
  unnest_tokens(word, text) %>%
  # keep tokens ending in a letter or apostrophe (drops pure numbers)
  filter(str_detect(word, "[a-z']$"),
         # remove common stop words
         !word %in% stop_words$word)
# N = 710,438