2.4 Building your own tokenizer

Regex time!

There are two approaches:

1. Split the string up according to some rule.
2. Extract tokens based on some rule.

2.4.1 Mimic tokenize_characters()

From a string, we can extract the letters one by one.

letter_tokens <- str_extract_all(
  string = "This sentence include 2 numbers and 1 period.",
  pattern = "[:alpha:]{1}"
)
letter_tokens

## [[1]]
##  [1] "T" "h" "i" "s" "s" "e" "n" "t" "e" "n" "c" "e" "i" "n" "c" "l" "u" "d" "e"
## [20] "n" "u" "m" "b" "e" "r" "s" "a" "n" "d" "p" "e" "r" "i" "o" "d"

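We have to be careful what we put in the regex: [:alpha:] also matches letters such as å and ø, whereas [a-zA-Z] only matches the unaccented letters a–z and A–Z, so the Danish letters are silently dropped in the second example below.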

danish_sentence <- "Så mødte han en gammel heks på landevejen"

str_extract_all(danish_sentence, "[:alpha:]")

## [[1]]
##  [1] "S" "å" "m" "ø" "d" "t" "e" "h" "a" "n" "e" "n" "g" "a" "m" "m" "e" "l" "h"
## [20] "e" "k" "s" "p" "å" "l" "a" "n" "d" "e" "v" "e" "j" "e" "n"

str_extract_all(danish_sentence, "[a-zA-Z]")

## [[1]]
##  [1] "S" "m" "d" "t" "e" "h" "a" "n" "e" "n" "g" "a" "m" "m" "e" "l" "h" "e" "k"
## [20] "s" "p" "l" "a" "n" "d" "e" "v" "e" "j" "e" "n"

2.4.2 Allow for hyphenated words in tokenize_words()

Let’s make “fir-tree” a single word token.

One way to do this is to split the text on whitespace and then drop leading and trailing punctuation.

str_split("This isn't a sentence with fir-tree.", "[:space:]")

## [[1]]
## [1] "This"      "isn't"     "a"         "sentence"  "with"      "fir-tree."

str_split("This isn't a sentence with fir-tree.", "[:space:]") %>%
  map(~ str_remove_all(.x, "^[:punct:]+|[:punct:]+$"))

## [[1]]
## [1] "This"     "isn't"    "a"        "sentence" "with"     "fir-tree"

Another way is to extract the hyphenated word.

str_extract_all(
  string = "This isn't a sentence with fir-tree.",
  pattern = "[:alpha:]+-[:alpha:]+"
)

## [[1]]
## [1] "fir-tree"

Use the ? quantifier in the regex to make the hyphen optional, so that words without a hyphen are matched too.

str_extract_all(
  string = "This isn't a sentence with fir-tree.",
  pattern = "[:alpha:]+-?[:alpha:]+"
)

## [[1]]
## [1] "This"     "isn"      "sentence" "with"     "fir-tree"

Include ' in the character class together with [:alpha:], so that contractions such as “isn't” are kept as a single token.

str_extract_all(
  string = "This isn't a sentence with fir-tree.",
  pattern = "[[:alpha:]']+-?[[:alpha:]']+"
)

## [[1]]
## [1] "This"     "isn't"    "sentence" "with"     "fir-tree"

The letter “a” is missing because the regex so far requires at least two characters. Get around that by using | to add an alternative that matches a single [:alpha:] character.

str_extract_all(
  string = "This isn't a sentence with fir-tree.",
  pattern = "[[:alpha:]']+-?[[:alpha:]']+|[:alpha:]{1}"
)

## [[1]]
## [1] "This"     "isn't"    "a"        "sentence" "with"     "fir-tree"

2.4.3 Character n-gram tokenizer

#' Tokenize strings into overlapping character n-grams
#'
#' For each element of `x`, every position that is followed by `n`
#' consecutive word characters (regex `\w`) yields one token made of those
#' `n` characters.
#'
#' @param x A character vector of texts to tokenize.
#' @param n A single positive whole number giving the n-gram length.
#' @return A list with one element per element of `x`; each element is a
#'   character vector of n-grams in order of appearance.
tokenize_character_ngram <- function(x, n) {
  # Fail early with a clear message instead of building a broken `\w{n}`
  # quantifier (vector, NA, zero, negative or fractional `n`).
  if (!is.numeric(n) || length(n) != 1L || is.na(n) || n < 1 ||
      n != round(n)) {
    stop("`n` must be a single positive whole number.", call. = FALSE)
  }
  n <- as.integer(n)

  # The lookahead matches without consuming characters, so overlapping
  # n-grams are found; "%d" keeps the quantifier free of scientific notation.
  ngram_loc <- str_locate_all(x, sprintf("(?=(\\w{%d}))", n))

  # Only the start column of each location matrix is needed: every n-gram
  # spans `n` characters from its start position.
  map2(ngram_loc, x, function(loc, text) {
    starts <- loc[, 1]
    str_sub(text, starts, starts + n - 1L)
  })
}

tokenize_character_ngram(the_fir_tree[1:3], n = 3)

## [[1]]
##  [1] "Far" "dow" "own" "the" "for" "ore" "res" "est" "whe" "her" "ere" "the"
## [13] "war" "arm" "sun" "and" "the" "fre" "res" "esh" "air" "mad" "ade" "swe"
## [25] "wee" "eet"
## 
## [[2]]
##  [1] "res" "est" "sti" "tin" "ing" "pla" "lac" "ace" "gre" "rew" "pre" "ret"
## [13] "ett" "tty" "lit" "itt" "ttl" "tle" "fir" "tre" "ree" "and" "yet" "was"
## [25] "not" "hap" "app" "ppy"
## 
## [[3]]
##  [1] "wis" "ish" "she" "hed" "muc" "uch" "tal" "all" "lik" "ike" "its" "com"
## [13] "omp" "mpa" "pan" "ani" "nio" "ion" "ons" "the" "pin" "ine" "nes" "and"
## [25] "fir" "irs" "whi" "hic" "ich" "gre" "rew"