Subsetting

Learning objectives:

  • Select multiple elements from a vector with [
  • Select single elements from a vector with [[ and $
  • Assign to subsets of vectors
  • Use subsetting to expand data

Selecting multiple elements

1. Positive integers return elements at specified positions

x <- c(1.1, 2.2, 3.3, 4.4) # decimal = original position
x
#> [1] 1.1 2.2 3.3 4.4
x[c(4, 1)]
#> [1] 4.4 1.1
x[c(1, 1, 1)]
#> [1] 1.1 1.1 1.1
x[c(1.9999)]
#> [1] 1.1

Reals truncate to integers.

x[c(1.0001, 1.9999)]
#> [1] 1.1 1.1

2. Negative integers remove specified elements

x[-c(1, 3)] # same as x[c(-1, -3)] or x[c(2, 4)]
#> [1] 2.2 4.4

2b. Mixing negative and positive integers throws an error

x[c(-1, 3)]
#> Error in x[c(-1, 3)]: only 0's may be mixed with negative subscripts

2c. Zeros ignored with other ints

x[c(-1, 0)]
#> [1] 2.2 3.3 4.4
x[c(-1, 0, 0, 0, 0, 0 ,0 ,0)]
#> [1] 2.2 3.3 4.4
x[c(1, 0, 2, 0, 3, 0)]
#> [1] 1.1 2.2 3.3

3. Logical vectors select specified elements

x[c(TRUE, TRUE, FALSE, TRUE)]
#> [1] 1.1 2.2 4.4
x[x < 3]
#> [1] 1.1 2.2
cond <- x > 2.5
x[cond]
#> [1] 3.3 4.4

3b. A shorter logical index is recycled to the length of the vector

x[FALSE]
#> numeric(0)
x[TRUE]
#> [1] 1.1 2.2 3.3 4.4
x[c(FALSE, TRUE)] # equivalent to: x[c(FALSE, TRUE, FALSE, TRUE)]
#> [1] 2.2 4.4
  • Recycling is easy to understand when x or y in x[y] has length 1; best to avoid other lengths

3c. NA index returns NA

x[c(NA, TRUE, NA, TRUE)]
#> [1]  NA 2.2  NA 4.4

3d. Out-of-bounds index (extra TRUE or too-large integer) returns NA

x[c(FALSE, TRUE, TRUE, TRUE, TRUE, TRUE)]
#> [1] 2.2 3.3 4.4  NA  NA
x[1:5]
#> [1] 1.1 2.2 3.3 4.4  NA

4. Indexing with nothing returns original vector

x[]
#> [1] 1.1 2.2 3.3 4.4

5. Indexing with just 0 returns 0-length vector (with class)

x[0]
#> numeric(0)
letters[0]
#> character(0)

6. Indexing with a character vector returns the matching elements of a named vector

(y <- setNames(x, letters[1:4]))
#>   a   b   c   d 
#> 1.1 2.2 3.3 4.4
y[c("d", "b", "a")]
#>   d   b   a 
#> 4.4 2.2 1.1
y[c("a", "a", "a")]
#>   a   a   a 
#> 1.1 1.1 1.1

6b. Names must be exact for [

z <- c(abc = 1, def = 2)
z
#> abc def 
#>   1   2
z[c("a", "d")]
#> <NA> <NA> 
#>   NA   NA

Subsetting a list with [ returns a list

my_list <- list(a = c(T, F), b = letters[5:15], c = 100:108)
my_list
#> $a
#> [1]  TRUE FALSE
#> 
#> $b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"
#> 
#> $c
#> [1] 100 101 102 103 104 105 106 107 108
my_list[c("a", "b")]
#> $a
#> [1]  TRUE FALSE
#> 
#> $b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"

Lists use same rules for [

my_list[2:3]
#> $b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"
#> 
#> $c
#> [1] 100 101 102 103 104 105 106 107 108
my_list[c(TRUE, FALSE, TRUE)]
#> $a
#> [1]  TRUE FALSE
#> 
#> $c
#> [1] 100 101 102 103 104 105 106 107 108

Matrices & arrays take multidimensional indices

a <- matrix(1:9, nrow = 3)
a
#>      [,1] [,2] [,3]
#> [1,]    1    4    7
#> [2,]    2    5    8
#> [3,]    3    6    9
a[1:2, 2:3] # rows, columns
#>      [,1] [,2]
#> [1,]    4    7
#> [2,]    5    8

Matrices & arrays can accept character, logical, etc

colnames(a) <- c("A", "B", "C")
a[c(TRUE, TRUE, FALSE), c("B", "A")] # a[1:2, 2:1]
#>      B A
#> [1,] 4 1
#> [2,] 5 2

Matrices & arrays are also vectors

vals <- outer(1:5, 1:5, FUN = "paste", sep = ",") # All chr combos of 1:5
vals
#>      [,1]  [,2]  [,3]  [,4]  [,5] 
#> [1,] "1,1" "1,2" "1,3" "1,4" "1,5"
#> [2,] "2,1" "2,2" "2,3" "2,4" "2,5"
#> [3,] "3,1" "3,2" "3,3" "3,4" "3,5"
#> [4,] "4,1" "4,2" "4,3" "4,4" "4,5"
#> [5,] "5,1" "5,2" "5,3" "5,4" "5,5"
vals[c(4, 15)]
#> [1] "4,1" "5,3"
a[a > 5]
#> [1] 6 7 8 9

Data frames subset list-like with single index

df <- data.frame(x = 1:3, y = 3:1, z = letters[1:3])
df[1:2]
#>   x y
#> 1 1 3
#> 2 2 2
#> 3 3 1
df[c("x", "z")]
#>   x z
#> 1 1 a
#> 2 2 b
#> 3 3 c

Data frames subset matrix-like with multiple indices

df[1:2, c("x", "z")] # rows, columns
#>   x z
#> 1 1 a
#> 2 2 b
df[df$x == 2, ] # matching rows, all columns
#>   x y z
#> 2 2 2 b
df[, c("x", "z")] # same result as df[c("x", "z")] (no comma)
#>   x z
#> 1 1 a
#> 2 2 b
#> 3 3 c

Subsetting a tibble with [ returns a tibble

tbl <- tibble::as_tibble(df)
df[, 1]
#> [1] 1 2 3
df[, 1, drop = FALSE] # Prevent errors
#>   x
#> 1 1
#> 2 2
#> 3 3
tbl[, 1]
#> # A tibble: 3 × 1
#>       x
#>   <int>
#> 1     1
#> 2     2
#> 3     3

Selecting a single element

[[ selects a single element

x <- list(1:3, "a", 4:6)
x[1]
#> [[1]]
#> [1] 1 2 3
class(x[1])
#> [1] "list"
x[[1]]
#> [1] 1 2 3
class(x[[1]])
#> [1] "integer"
x[[1]][[1]]
#> [1] 1

$ is shorthand for [[..., exact = FALSE]]

x <- list(abc = 1)
x$abc
#> [1] 1
x$a
#> [1] 1
x[["a"]]
#> NULL
x[["a", exact = FALSE]]
#> [1] 1
options(warnPartialMatchDollar = TRUE)
x$a
#> Warning in x$a: partial match of 'a' to 'abc'
#> [1] 1

Behavior for missing-ish indices is inconsistent

a <- c(a = 1L, b = 2L)
lst <- list(a = 1:2)

# Errors:
# a[[NULL]]
# lst[[NULL]]
# a[[5]]
# lst[[5]]
# a[["c"]]
# a[[NA]]

lst[["c"]]
#> NULL
lst[[NA]]
#> NULL

purrr::pluck() and purrr::chuck() provide consistent wrappers

  • purrr::pluck() always returns NULL (or .default, if supplied) when the element is missing
  • purrr::chuck() always throws an error when the element is missing
purrr::pluck(a, 5)
#> NULL
purrr::pluck(a, "c")
#> NULL
purrr::pluck(lst, 5)
#> NULL
purrr::pluck(lst, "c")
#> NULL

S4 has two additional subsetting operators

  • @ equivalent to $ (but throws an error if the slot does not exist)
  • slot() equivalent to [[

More in Chapter 15

Subsetting and assignment

Can assign to position with [

x <- 1:5
x[1:2] <- c(101, 102)
x
#> [1] 101 102   3   4   5
x[1:3] <- 1:2
x
#> [1] 1 2 1 4 5

Remove list component with NULL

x <- list(a = 1, b = 2)
x[["b"]] <- NULL
x
#> $a
#> [1] 1

Use x["b"] <- list(NULL) (single [) to add a literal NULL; with [[, as below, b becomes list(NULL)

x <- list(a = 1, b = 2)
x[["b"]] <- list(NULL)
x
#> $a
#> [1] 1
#> 
#> $b
#> $b[[1]]
#> NULL

Subset with nothing to retain shape

df <- data.frame(a = 1:3, b = 1:3)
df[] <- "a"
df
#>   a b
#> 1 a a
#> 2 a a
#> 3 a a
df <- "a"
df
#> [1] "a"

Applications

Use a named lookup vector and character subsetting (names may repeat) to translate values

x <- c("b", "g", "x", "g", "g", "b")
lookup <- c(b = "blue", g = "green", x = NA)
lookup[x]
#>       b       g       x       g       g       b 
#>  "blue" "green"      NA "green" "green"  "blue"
unname(lookup[x])
#> [1] "blue"  "green" NA      "green" "green" "blue"

Use a lookup table to generate rows of data

info <- data.frame(
  code = c("b", "g", "x"),
  color = c("blue", "green", NA),
  other_thing = 3:1
)
match(x, info$code) # Position in info$code of each element of x
#> [1] 1 2 3 2 2 1
info[match(x, info$code), ]
#>     code color other_thing
#> 1      b  blue           3
#> 2      g green           2
#> 3      x  <NA>           1
#> 2.1    g green           2
#> 2.2    g green           2
#> 1.1    b  blue           3

Sort with order()

x <- c("b", "c", "a")
order(x)
#> [1] 3 1 2
x[order(x)]
#> [1] "a" "b" "c"
df <- data.frame(b = 3:1, a = 1:3)
df[order(df$b), ]
#>   b a
#> 3 1 3
#> 2 2 2
#> 1 3 1
df[, order(names(df))]
#>   a b
#> 1 1 3
#> 2 2 2
#> 3 3 1

Expand counts

df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
rep(1:nrow(df), df$n)
#> [1] 1 1 1 2 2 2 2 2 3
df[rep(1:nrow(df), df$n), ]
#>     x  y n
#> 1   2  9 3
#> 1.1 2  9 3
#> 1.2 2  9 3
#> 2   4 11 5
#> 2.1 4 11 5
#> 2.2 4 11 5
#> 2.3 4 11 5
#> 2.4 4 11 5
#> 3   1  6 1

Topics we ran out of time to make slides for

Ideally a future cohort should expand these:

  • Remove df columns with setdiff()
  • Logically subset rows df[df$col > 5, ]
  • The next slide about which()

Boolean algebra versus sets (logical and integer)

  • which() gives the indices of the TRUE elements of a logical vector
(x1 <- 1:10 %% 2 == 0) # 1-10 divisible by 2
#  [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
(x2 <- which(x1))
# [1]  2  4  6  8 10
(y1 <- 1:10 %% 5 == 0) # 1-10 divisible by 5
#  [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
(y2 <- which(y1))
# [1]  5 10
x1 & y1
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE