2.4 Building your own tokenizer
Regex time!
There are two approaches, sketched briefly below:
- Split the string up according to some rule.
- Extract tokens based on some rule.
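A quick sketch of the two approaches on a toy string (the string and patterns here are just for illustration; the real examples follow in the next subsections):
library(stringr)

toy <- "Hello, world! 42"

# Approach 1: split the string on a rule (here, white space).
str_split(toy, "[:space:]")
## [[1]]
## [1] "Hello," "world!" "42"

# Approach 2: extract the pieces that match a rule (here, runs of letters).
str_extract_all(toy, "[:alpha:]+")
## [[1]]
## [1] "Hello" "world"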
2.4.1 Mimic tokenize_characters()
From a string, we can extract the letters one by one.
letter_tokens <- str_extract_all(
  string = "This sentence include 2 numbers and 1 period.",
  pattern = "[:alpha:]{1}"
)
letter_tokens
## [[1]]
## [1] "T" "h" "i" "s" "s" "e" "n" "t" "e" "n" "c" "e" "i" "n" "c" "l" "u" "d" "e"
## [20] "n" "u" "m" "b" "e" "r" "s" "a" "n" "d" "p" "e" "r" "i" "o" "d"
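Note that the digits “2” and “1” are dropped, since [:alpha:] only matches letters. As an aside (not part of the example above), swapping in the [:alnum:] class would keep the digits along with the letters:
str_extract_all(
  string = "This sentence include 2 numbers and 1 period.",
  pattern = "[:alnum:]{1}"
)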
We have to be careful about what we put in the regex. The [:alpha:] class matches letters beyond a–z, so it handles Danish correctly:
danish_sentence <- "Så mødte han en gammel heks på landevejen"
str_extract_all(danish_sentence, "[:alpha:]")
## [[1]]
## [1] "S" "å" "m" "ø" "d" "t" "e" "h" "a" "n" "e" "n" "g" "a" "m" "m" "e" "l" "h"
## [20] "e" "k" "s" "p" "å" "l" "a" "n" "d" "e" "v" "e" "j" "e" "n"
A character class that only covers ASCII letters, such as [a-zA-Z], silently drops "å" and "ø":
str_extract_all(danish_sentence, "[a-zA-Z]")
## [[1]]
## [1] "S" "m" "d" "t" "e" "h" "a" "n" "e" "n" "g" "a" "m" "m" "e" "l" "h" "e" "k"
## [20] "s" "p" "l" "a" "n" "d" "e" "v" "e" "j" "e" "n"
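Putting this together, here is a minimal sketch of a character tokenizer in the spirit of tokenize_characters(); the function name tokenize_characters_regex() is made up for this sketch, and the lowercasing step is an assumption meant to mirror the package default:
tokenize_characters_regex <- function(x) {
  # Lowercase, then pull out one letter at a time; [:alpha:] keeps
  # non-ASCII letters such as "å" and "ø".
  str_extract_all(str_to_lower(x), "[:alpha:]")
}

tokenize_characters_regex(danish_sentence)
# returns the same letters as above, now lowercased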
2.4.2 Allow for hyphenated words in tokenize_words()
Let’s make “fir-tree” a single word token.
One way to do this is to split the text on white space and then drop leading and trailing punctuation. Splitting alone leaves the period attached to "fir-tree.":
str_split("This isn't a sentence with fir-tree.", "[:space:]")
## [[1]]
## [1] "This" "isn't" "a" "sentence" "with" "fir-tree."
str_split("This isn't a sentence with fir-tree.", "[:space:]") %>%
map(~ str_remove_all(.x, "^[:punct:]+|[:punct:]+$"))
## [[1]]
## [1] "This" "isn't" "a" "sentence" "with" "fir-tree"
Another way is to extract the tokens instead of splitting them out. Start with a pattern that matches only the hyphenated word:
str_extract_all(
  string = "This isn't a sentence with fir-tree.",
  pattern = "[:alpha:]+-[:alpha:]+"
)
## [[1]]
## [1] "fir-tree"
Adding the ? quantifier makes the hyphen optional, so the pattern also matches words without a hyphen:
str_extract_all(
  string = "This isn't a sentence with fir-tree.",
  pattern = "[:alpha:]+-?[:alpha:]+"
)
## [[1]]
## [1] "This" "isn" "sentence" "with" "fir-tree"
The contraction "isn't" was cut short because the apostrophe is not part of [:alpha:]. Include ' inside the character class to keep it whole:
str_extract_all(
  string = "This isn't a sentence with fir-tree.",
  pattern = "[[:alpha:]']+-?[[:alpha:]']+"
)
## [[1]]
## [1] "This" "isn't" "sentence" "with" "fir-tree"
The letter “a” is still missing, because the regex so far requires at least two characters. We can get around that by using | to add an alternative that matches a single [:alpha:] character:
str_extract_all(
  string = "This isn't a sentence with fir-tree.",
  pattern = "[[:alpha:]']+-?[[:alpha:]']+|[:alpha:]{1}"
)
## [[1]]
## [1] "This" "isn't" "a" "sentence" "with" "fir-tree"
2.4.3 Character n-gram tokenizer
tokenize_character_ngram <- function(x, n) {
  # Zero-width lookahead lets the n-gram matches overlap; we only record
  # the starting position of each match.
  ngram_loc <- str_locate_all(x, paste0("(?=(\\w{", n, "}))"))
  # Pull out the n characters starting at each recorded position.
  map2(ngram_loc, x, ~ str_sub(.y, .x[, 1], .x[, 1] + n - 1))
}
tokenize_character_ngram(the_fir_tree[1:3], n = 3)
## [[1]]
## [1] "Far" "dow" "own" "the" "for" "ore" "res" "est" "whe" "her" "ere" "the"
## [13] "war" "arm" "sun" "and" "the" "fre" "res" "esh" "air" "mad" "ade" "swe"
## [25] "wee" "eet"
##
## [[2]]
## [1] "res" "est" "sti" "tin" "ing" "pla" "lac" "ace" "gre" "rew" "pre" "ret"
## [13] "ett" "tty" "lit" "itt" "ttl" "tle" "fir" "tre" "ree" "and" "yet" "was"
## [25] "not" "hap" "app" "ppy"
##
## [[3]]
## [1] "wis" "ish" "she" "hed" "muc" "uch" "tal" "all" "lik" "ike" "its" "com"
## [13] "omp" "mpa" "pan" "ani" "nio" "ion" "ons" "the" "pin" "ine" "nes" "and"
## [25] "fir" "irs" "whi" "hic" "ich" "gre" "rew"