Subsetting

Learning objectives

Select multiple elements from a vector with [
Select single elements from a vector with [[ and $
Assign to subsets of vectors
Use subsetting to expand data

Selecting multiple elements

1. Positive integers return elements at specified positions

x <- c(1.1, 2.2, 3.3, 4.4) # decimal = original position
x

#> [1] 1.1 2.2 3.3 4.4

x[c(4, 1)]

#> [1] 4.4 1.1

x[c(1, 1, 1)]

#> [1] 1.1 1.1 1.1

x[c(1.9999)]

#> [1] 1.1

Reals truncate to integers.

x[c(1.0001, 1.9999)]

#> [1] 1.1 1.1

2. Negative integers remove specified elements

x[-c(1, 3)] # same as x[c(-1, -3)] or x[c(2, 4)]

#> [1] 2.2 4.4

2b. Mixing negative and positive integers throws an error

x[c(-1, 3)]

#> Error in x[c(-1, 3)]: only 0's may be mixed with negative subscripts

2c. Zeros ignored with other ints

x[c(-1, 0)]

#> [1] 2.2 3.3 4.4

x[c(-1, 0, 0, 0, 0, 0 ,0 ,0)]

#> [1] 2.2 3.3 4.4

x[c(1, 0, 2, 0, 3, 0)]

#> [1] 1.1 2.2 3.3

3. Logical vectors select specified elements

x[c(TRUE, TRUE, FALSE, TRUE)]

#> [1] 1.1 2.2 4.4

x[x < 3]

#> [1] 1.1 2.2

cond <- x > 2.5
x[cond]

#> [1] 3.3 4.4

3b. Shorter logical vectors are recycled to the length of the vector being subset

x[FALSE]

#> numeric(0)

x[TRUE]

#> [1] 1.1 2.2 3.3 4.4

x[c(FALSE, TRUE)] # equivalent to: x[c(FALSE, TRUE, FALSE, TRUE)]

#> [1] 2.2 4.4

In x[y], recycling is easy to understand when x or y has length 1; it is best to avoid relying on recycling for other lengths.

3c. NA index returns NA

x[c(NA, TRUE, NA, TRUE)]

#> [1]  NA 2.2  NA 4.4

3d. Extra TRUE index returns NA

x[c(FALSE, TRUE, TRUE, TRUE, TRUE, TRUE)]

#> [1] 2.2 3.3 4.4  NA  NA

x[1:5]

#> [1] 1.1 2.2 3.3 4.4  NA

4. Indexing with nothing returns original vector

x[]

#> [1] 1.1 2.2 3.3 4.4

5. Indexing with just 0 returns 0-length vector (with class)

x[0]

#> numeric(0)

letters[0]

#> character(0)

6. Indexing with character vector returns element of named vector

(y <- setNames(x, letters[1:4]))

#>   a   b   c   d 
#> 1.1 2.2 3.3 4.4

y[c("d", "b", "a")]

#>   d   b   a 
#> 4.4 2.2 1.1

y[c("a", "a", "a")]

#>   a   a   a 
#> 1.1 1.1 1.1

6b. Names must be exact for `[`

z <- c(abc = 1, def = 2)
z

#> abc def 
#>   1   2

z[c("a", "d")]

#> <NA> <NA> 
#>   NA   NA

Subsetting a list with `[` returns a list

my_list <- list(a = c(T, F), b = letters[5:15], c = 100:108)
my_list

#> $a
#> [1]  TRUE FALSE
#> 
#> $b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"
#> 
#> $c
#> [1] 100 101 102 103 104 105 106 107 108

my_list[c("a", "b")]

#> $a
#> [1]  TRUE FALSE
#> 
#> $b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"

Lists use same rules for `[`

my_list[2:3]

#> $b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"
#> 
#> $c
#> [1] 100 101 102 103 104 105 106 107 108

my_list[c(TRUE, FALSE, TRUE)]

#> $a
#> [1]  TRUE FALSE
#> 
#> $c
#> [1] 100 101 102 103 104 105 106 107 108

Matrices & arrays take multidimensional indices

a <- matrix(1:9, nrow = 3)
a

#>      [,1] [,2] [,3]
#> [1,]    1    4    7
#> [2,]    2    5    8
#> [3,]    3    6    9

a[1:2, 2:3] # rows, columns

#>      [,1] [,2]
#> [1,]    4    7
#> [2,]    5    8

Matrices & arrays can accept character, logical, etc

colnames(a) <- c("A", "B", "C")
a[c(TRUE, TRUE, FALSE), c("B", "A")] # a[1:2, 2:1]

#>      B A
#> [1,] 4 1
#> [2,] 5 2

Matrices & arrays are also vectors

vals <- outer(1:5, 1:5, FUN = "paste", sep = ",") # All chr combos of 1:5
vals

#>      [,1]  [,2]  [,3]  [,4]  [,5] 
#> [1,] "1,1" "1,2" "1,3" "1,4" "1,5"
#> [2,] "2,1" "2,2" "2,3" "2,4" "2,5"
#> [3,] "3,1" "3,2" "3,3" "3,4" "3,5"
#> [4,] "4,1" "4,2" "4,3" "4,4" "4,5"
#> [5,] "5,1" "5,2" "5,3" "5,4" "5,5"

vals[c(4, 15)]

#> [1] "4,1" "5,3"

a[a > 5]

#> [1] 6 7 8 9

Data frames subset list-like with single index

df <- data.frame(x = 1:3, y = 3:1, z = letters[1:3])
df[1:2]

#>   x y
#> 1 1 3
#> 2 2 2
#> 3 3 1

df[c("x", "z")]

#>   x z
#> 1 1 a
#> 2 2 b
#> 3 3 c

Data frames subset matrix-like with multiple indices

df[1:2, c("x", "z")] # rows, columns

#>   x z
#> 1 1 a
#> 2 2 b

df[df$x == 2, ] # matching rows, all columns

#>   x y z
#> 2 2 2 b

df[, c("x", "z")] # all rows; same result as df[c("x", "z")] (no comma)

#>   x z
#> 1 1 a
#> 2 2 b
#> 3 3 c

Subsetting a tibble with `[` returns a tibble

tbl <- tibble::as_tibble(df)
df[, 1]

#> [1] 1 2 3

df[, 1, drop = FALSE] # drop = FALSE keeps a data frame; prevents errors downstream

#>   x
#> 1 1
#> 2 2
#> 3 3

tbl[, 1]

#> # A tibble: 3 × 1
#>       x
#>   <int>
#> 1     1
#> 2     2
#> 3     3

Selecting a single element

`[[` selects a single element

x <- list(1:3, "a", 4:6)
x[1]

#> [[1]]
#> [1] 1 2 3

class(x[1])

#> [1] "list"

x[[1]]

#> [1] 1 2 3

class(x[[1]])

#> [1] "integer"

x[[1]][[1]]

#> [1] 1

`$` is shorthand for `[[..., exact = FALSE]]`

x <- list(abc = 1)
x$abc

#> [1] 1

x$a

#> [1] 1

x[["a"]]

#> NULL

x[["a", exact = FALSE]]

#> [1] 1

options(warnPartialMatchDollar = TRUE)
x$a

#> Warning in x$a: partial match of 'a' to 'abc'

#> [1] 1

Behavior for missing-ish indices is inconsistent

a <- c(a = 1L, b = 2L)
lst <- list(a = 1:2)

# Errors:
# a[[NULL]]
# lst[[NULL]]
# a[[5]]
# lst[[5]]
# a[["c"]]
# a[[NA]]

lst[["c"]]

#> NULL

lst[[NA]]

#> NULL

`purrr::pluck()` and `purrr::chuck()` provide consistent wrappers

purrr::pluck() always returns NULL (or the value of .default) for missing elements (with a non-NULL index)
purrr::chuck() always throws an error for missing elements

purrr::pluck(a, 5)

#> NULL

purrr::pluck(a, "c")

#> NULL

purrr::pluck(lst, 5)

#> NULL

purrr::pluck(lst, "c")

#> NULL

S4 has two additional subsetting operators

@ is equivalent to $ (but throws an error if the slot does not exist)
slot() is equivalent to [[

Subsetting and assignment

Can assign to position with `[`

x <- 1:5
x[1:2] <- c(101, 102)
x

#> [1] 101 102   3   4   5

x[1:3] <- 1:2
x

#> [1] 1 2 1 4 5

Remove list component with `NULL`

x <- list(a = 1, b = 2)
x[["b"]] <- NULL
x

#> $a
#> [1] 1

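Use `list(NULL)` to add `NULL`

Note: to store a literal `NULL`, assign with single brackets: `x["b"] <- list(NULL)`. With `[[`, as in the code below, the element becomes a list that contains `NULL`.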

x <- list(a = 1, b = 2)
x[["b"]] <- list(NULL)
x

#> $a
#> [1] 1
#> 
#> $b
#> $b[[1]]
#> NULL

Subset with nothing to retain shape

df <- data.frame(a = 1:3, b = 1:3)
df[] <- "a"
df

#>   a b
#> 1 a a
#> 2 a a
#> 3 a a

df <- "a"
df

#> [1] "a"

Applications

Use a lookup vector and recycling rules to translate values

x <- c("b", "g", "x", "g", "g", "b")
lookup <- c(b = "blue", g = "green", x = NA)
lookup[x]

#>       b       g       x       g       g       b 
#>  "blue" "green"      NA "green" "green"  "blue"

unname(lookup[x])

#> [1] "blue"  "green" NA      "green" "green" "blue"

Use a lookup table to generate rows of data

info <- data.frame(
  code = c("b", "g", "x"),
  color = c("blue", "green", NA),
  other_thing = 3:1
)
match(x, info$code) # Position in info$code of each element of x

#> [1] 1 2 3 2 2 1

info[match(x, info$code), ]

#>     code color other_thing
#> 1      b  blue           3
#> 2      g green           2
#> 3      x  <NA>           1
#> 2.1    g green           2
#> 2.2    g green           2
#> 1.1    b  blue           3

Sort with `order()`

x <- c("b", "c", "a")
order(x)

#> [1] 3 1 2

x[order(x)]

#> [1] "a" "b" "c"

df <- data.frame(b = 3:1, a = 1:3)
df[order(df$b), ]

#>   b a
#> 3 1 3
#> 2 2 2
#> 1 3 1

df[, order(names(df))]

#>   a b
#> 1 1 3
#> 2 2 2
#> 3 3 1

Expand counts

df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
rep(1:nrow(df), df$n)

#> [1] 1 1 1 2 2 2 2 2 3

df[rep(1:nrow(df), df$n), ]

#>     x  y n
#> 1   2  9 3
#> 1.1 2  9 3
#> 1.2 2  9 3
#> 2   4 11 5
#> 2.1 4 11 5
#> 2.2 4 11 5
#> 2.3 4 11 5
#> 2.4 4 11 5
#> 3   1  6 1

Ran out of time to make slides for

Ideally a future cohort should expand these:

Remove df columns with setdiff()
Logically subset rows df[df$col > 5, ]
The next slide about which()

Boolean algebra versus sets (logical and integer)

which() gives the indices of a Boolean vector

(x1 <- 1:10 %% 2 == 0) # 1-10 divisible by 2
#  [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
(x2 <- which(x1))
# [1]  2  4  6  8 10
(y1 <- 1:10 %% 5 == 0) # 1-10 divisible by 5
#  [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
(y2 <- which(y1))
# [1]  5 10
x1 & y1
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE

Subsetting

Learning objectives

Selecting multiple elements

1. Positive integers return elements at specified positions

2. Negative integers remove specified elements

2b. Mixing negative and positive integers throws an error

2c. Zeros ignored with other ints

3. Logical vectors select specified elements

3b. Shorter logical vectors are recycled to the length of the vector being subset

3c. NA index returns NA

3d. Extra TRUE index returns NA

4. Indexing with nothing returns original vector

5. Indexing with just 0 returns 0-length vector (with class)

6. Indexing with character vector returns element of named vector

6b. Names must be exact for [

Subsetting a list with [ returns a list

Lists use same rules for [

Matrices & arrays take multidimensional indices

Matrices & arrays can accept character, logical, etc

Matrices & arrays are also vectors

Data frames subset list-like with single index

Data frames subset matrix-like with multiple indices

Subsetting a tibble with [ returns a tibble

Selecting a single element

[[ selects a single element

$ is shorthand for [[..., exact = FALSE]]

Behavior for missing-ish indices is inconsistent

purrr::pluck() and purrr::chuck() provide consistent wrappers

S4 has two additional subsetting operators

Subsetting and assignment

Can assign to position with [

Remove list component with NULL

Use list(NULL) to add NULL

Subset with nothing to retain shape

Applications

Use a lookup vector and recycling rules to translate values

Use a lookup table to generate rows of data

Sort with order()

Expand counts

Ran out of time to make slides for

Boolean algebra versus sets (logical and integer)

6b. Names must be exact for `[`

Subsetting a list with `[` returns a list

Lists use same rules for `[`

Subsetting a tibble with `[` returns a tibble

`[[` selects a single element

`$` is shorthand for `[[..., exact = FALSE]]`

`purrr::pluck()` and `purrr::chuck()` provide consistent wrappers

Can assign to position with `[`

Remove list component with `NULL`

Use `list(NULL)` to add `NULL`

Sort with `order()`