4.1 Selecting multiple elements

4.1.1 Atomic Vectors

  • 6 ways to subset atomic vectors

Let’s take a look with an example vector.

x <- c(1.1, 2.2, 3.3, 4.4)

Positive integer indices

# return elements at specified positions which can be out of order
x[c(4, 1)]
#> [1] 4.4 1.1

# duplicate indices return duplicate values
x[c(2, 2)]
#> [1] 2.2 2.2

# real numbers truncate to integers
# so this behaves as if it is x[c(3, 3)]
x[c(3.2, 3.8)]
#> [1] 3.3 3.3

Negative integer indices

### excludes elements at specified positions
x[-c(1, 3)] # same as x[c(-1, -3)] or x[c(2, 4)]
#> [1] 2.2 4.4

### mixing positive and negative is a no-no
x[c(-1, 3)]
#> Error in x[c(-1, 3)]: only 0's may be mixed with negative subscripts

Logical Vectors

x[c(TRUE, TRUE, FALSE, TRUE)]
#> [1] 1.1 2.2 4.4

x[x < 3]
#> [1] 1.1 2.2

cond <- x > 2.5
x[cond]
#> [1] 3.3 4.4
  • Recycling rules apply when the two vectors in x[y] (the vector x and the logical index y) are of different lengths
  • the shorter of the two is recycled to the length of the longer
  • Easy to understand if x or y has length 1; best to avoid recycling for other lengths
x[c(F, T)] # equivalent to: x[c(FALSE, TRUE, FALSE, TRUE)]
#> [1] 2.2 4.4

Missing values (NA)

# Missing values in index will also return NA in output
x[c(NA, TRUE)]
#> [1]  NA 2.2  NA 4.4

Nothing

# returns the original vector
x[]
#> [1] 1.1 2.2 3.3 4.4

Zero

# returns a zero-length vector
x[0]
#> numeric(0)

Character vectors

# if the vector is named, you can use a character vector to return the elements with matching names
(y <- setNames(x, letters[1:4]))
#>   a   b   c   d 
#> 1.1 2.2 3.3 4.4

y[c("d", "b", "a")]
#>   d   b   a 
#> 4.4 2.2 1.1

# Like integer indices, you can repeat indices
y[c("a", "a", "a")]
#>   a   a   a 
#> 1.1 1.1 1.1

# When subsetting with [, names are always matched exactly
z <- c(abc = 1, def = 2)
z
#> abc def 
#>   1   2
z[c("a", "d")]
#> <NA> <NA> 
#>   NA   NA

4.1.2 Lists

  • Subsetting a list works the same way as subsetting an atomic vector
  • [ always returns a list
  • [[ and $ let you pull elements out of a list
my_list <- list(a = c(T, F), b = letters[5:15], c = 100:108)
my_list
#> $a
#> [1]  TRUE FALSE
#> 
#> $b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"
#> 
#> $c
#> [1] 100 101 102 103 104 105 106 107 108

Return a (named) list

l1 <- my_list[2]
l1
#> $b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"

Return a vector

l2 <- my_list[[2]]
l2
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"
l2b <- my_list$b
l2b
#>  [1] "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o"

Return a specific element

l3 <- my_list[[2]][3]
l3
#> [1] "g"
l4 <- my_list[['b']][3]
l4
#> [1] "g"
l4b <- my_list$b[3]
l4b
#> [1] "g"

Visual Representation

See this Stack Overflow question for more detailed information about the differences between [ and [[: https://stackoverflow.com/questions/1169456/the-difference-between-bracket-and-double-bracket-for-accessing-the-el

4.1.3 Matrices and arrays

You can subset higher dimensional structures in three ways:

  • with multiple vectors
  • with a single vector
  • with a matrix
a <- matrix(1:12, nrow = 3)
colnames(a) <- c("A", "B", "C", "D")

# single row
a[1, ]
#>  A  B  C  D 
#>  1  4  7 10

# single column
a[, 1]
#> [1] 1 2 3

# single element
a[1, 1]
#> A 
#> 1

# two rows from two columns
a[1:2, 3:4]
#>      C  D
#> [1,] 7 10
#> [2,] 8 11

a[c(TRUE, FALSE, TRUE), c("B", "A")]
#>      B A
#> [1,] 4 1
#> [2,] 6 3

# zero index and negative index
a[0, -2]
#>      A C D

Subset a matrix with a matrix

b <- matrix(1:4, nrow = 2)
b
#>      [,1] [,2]
#> [1,]    1    3
#> [2,]    2    4
a[b]
#> [1]  7 11
vals <- outer(1:5, 1:5, FUN = "paste", sep = ",")
vals
#>      [,1]  [,2]  [,3]  [,4]  [,5] 
#> [1,] "1,1" "1,2" "1,3" "1,4" "1,5"
#> [2,] "2,1" "2,2" "2,3" "2,4" "2,5"
#> [3,] "3,1" "3,2" "3,3" "3,4" "3,5"
#> [4,] "4,1" "4,2" "4,3" "4,4" "4,5"
#> [5,] "5,1" "5,2" "5,3" "5,4" "5,5"

select <- matrix(ncol = 2, byrow = TRUE, 
                 c(1, 1,
                   3, 1,
                   2, 4))
select
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    3    1
#> [3,]    2    4

vals[select]
#> [1] "1,1" "3,1" "2,4"

Matrices and arrays are just vectors with special attributes, so you can also subset them with a single vector, as if they were 1D vectors (arrays in R are stored in column-major order, i.e. column by column)

vals[c(3, 15, 16, 17)]
#> [1] "3,1" "5,3" "1,4" "2,4"

4.1.4 Data frames and tibbles

Data frames act like both lists and matrices

  • When subsetting with a single index, they behave like lists and index the columns, so df[1:2] selects the first two columns.
  • When subsetting with two indices, they behave like matrices, so df[1:3, ] selects the first three rows (and all the columns).
library(palmerpenguins)
penguins <- penguins

# a single index selects columns; here, the second and third columns
two_cols <- penguins[2:3] # or penguins[c(2,3)]
head(two_cols)
#> # A tibble: 6 × 2
#>   island    bill_length_mm
#>   <fct>              <dbl>
#> 1 Torgersen           39.1
#> 2 Torgersen           39.5
#> 3 Torgersen           40.3
#> 4 Torgersen           NA  
#> 5 Torgersen           36.7
#> 6 Torgersen           39.3

# equivalent to the above code
same_two_cols <- penguins[c("island", "bill_length_mm")]
head(same_two_cols)
#> # A tibble: 6 × 2
#>   island    bill_length_mm
#>   <fct>              <dbl>
#> 1 Torgersen           39.1
#> 2 Torgersen           39.5
#> 3 Torgersen           40.3
#> 4 Torgersen           NA  
#> 5 Torgersen           36.7
#> 6 Torgersen           39.3

# two indices separated by comma (first two rows of 3rd and 4th columns)
penguins[1:2, 3:4]
#> # A tibble: 2 × 2
#>   bill_length_mm bill_depth_mm
#>            <dbl>         <dbl>
#> 1           39.1          18.7
#> 2           39.5          17.4

# Can't do this...
penguins[[3:4]][c(1:4)]
#> Error:
#> ! The `j` argument of `[[.tbl_df()` can't be a vector of length 2 as of
#>   tibble 3.0.0.
#> ℹ Recursive subsetting is deprecated for tibbles.
# ...but this works...
penguins[[3]][c(1:4)]
#> [1] 39.1 39.5 40.3   NA
# ...or this equivalent...
penguins$bill_length_mm[1:4]
#> [1] 39.1 39.5 40.3   NA

Subsetting a tibble with [ always returns a tibble (unless drop = TRUE is passed explicitly)

4.1.5 Preserving dimensionality

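  • Data frames and tibbles behave differently when a single column is selected with [
  • tibbles preserve dimensionality by default; data frames do not (a single selected column is simplified to a vector)
  • the data frame default can lead to unexpected behavior and broken code later on, because the type of the result depends on how many columns are selected
  • Use drop = FALSE to preserve dimensionality when subsetting a data frame, or use tibbles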
tb <- tibble::tibble(a = 1:2, b = 1:2)

# returns tibble
str(tb[, "a"])
#> tibble [2 × 1] (S3: tbl_df/tbl/data.frame)
#>  $ a: int [1:2] 1 2
tb[, "a"] # equivalent to tb[, "a", drop = FALSE]
#> # A tibble: 2 × 1
#>       a
#>   <int>
#> 1     1
#> 2     2

# returns integer vector
# str(tb[, "a", drop = TRUE])
tb[, "a", drop = TRUE]
#> [1] 1 2
df <- data.frame(a = 1:2, b = 1:2)

# returns integer vector
# str(df[, "a"])
df[, "a"]
#> [1] 1 2

# returns data frame with one column
# str(df[, "a", drop = FALSE])
df[, "a", drop = FALSE]
#>   a
#> 1 1
#> 2 2

Factors

Factor subsetting also has a drop argument, but its meaning is different: it controls whether or not levels (rather than dimensions) are preserved, and it defaults to FALSE.

z <- factor(c("a", "b", "c"))
z[1]
#> [1] a
#> Levels: a b c
z[1, drop = TRUE]
#> [1] a
#> Levels: a