Names and values

Learning objectives

  • Distinguish between an object and its name.
  • Identify when data are copied versus modified.
  • Trace and identify the memory used by R.

The {lobstr} package will help us throughout the chapter

library(lobstr)

Syntactic names are easier to create and work with than non-syntactic names

  • Syntactic names: my_variable, x, cpp11, .by.
    • Can’t use reserved words (listed in ?Reserved), e.g. TRUE, NULL, if, function
  • Non-syntactic names must be surrounded by backticks, e.g. `_abc` <- 1.

Names are bound to values with <-

# Create a double vector and bind the name `a` to it.
a <- c(1, 2, 3)
a
#> [1] 1 2 3
# obj_addr() reports the address of the object that `a` points to
# (the concrete address will differ from run to run).
obj_addr(a)
#> [1] "0x1624dfae968"

Many names can be bound to the same value

# `b <- a` binds a second name to the existing object rather than copying it,
# which is why both calls below report the same address.
b <- a
obj_addr(a)
#> [1] "0x1624dfae968"
obj_addr(b)
#> [1] "0x1624dfae968"

If shared values are modified, the object is copied to a new address

# The object is still shared with `a`, so modifying it through `b` makes a copy:
# `a` keeps its address, while `b` now reports a new one.
b[[1]] <- 5
obj_addr(a)
#> [1] "0x1624dfae968"
obj_addr(b)
#> [1] "0x16249d8f228"

Memory addresses can differ even if objects seem the same

# `b` is a second name for the object bound to `a`; `c` comes from a separate
# `1:10` call, so it is a different object that merely has equal contents.
# NOTE(review): the variable `c` shadows the name of base::c(). Calls such as
# c(1, 2) still find the function, but another name would be less confusing.
a <- 1:10
b <- a
c <- 1:10

obj_addr(a)
#> [1] "0x1624bd3f5e8"
obj_addr(b) # same address as `a`
#> [1] "0x1624bd3f5e8"
obj_addr(c) # different address
#> [1] "0x1624bd7a540"

Functions have a single address regardless of how they’re referenced

# Three ways of reaching the same function object: by bare name, via the
# namespace operator, and by looking the name up with get(). The identical
# addresses show they all resolve to one object.
obj_addr(mean)
#> [1] "0x162498c1738"
obj_addr(base::mean)
#> [1] "0x162498c1738"
obj_addr(get("mean"))
#> [1] "0x162498c1738"

Unlike most objects, environments are modified in place, so they keep the same memory address

# Environments are modified in place: `e <- d` binds a second name to the same
# environment, and assigning into it through `e` does not create a copy.
d <- new.env()
obj_addr(d)
#> [1] "0x1624d749900"
e <- d
e[["a"]] <- 1
# Neither name has moved to a new address ...
obj_addr(e)
#> [1] "0x1624d749900"
obj_addr(d)
#> [1] "0x1624d749900"
# ... and the value written through `e` is visible through `d`.
d[["a"]]
#> [1] 1

Use tracemem() to check whether an object is copied when it is modified

# tracemem() marks the object bound to `x` so that a message is printed every
# time it is copied; the string it returns identifies the traced object.
x <- runif(10)
tracemem(x)
#> [1] "<000001F4185B4B08>"
# `y` now shares the object with `x`, so the modification below must copy it
# first -- the tracemem line shows the old and the new address.
y <- x
x[[1]] <- 10
#> tracemem[0x000001f4185b4b08 -> 0x000001f4185b4218]:
# Switch tracing off again.
untracemem(x)

tracemem() shows that modifying a list uses internal C code that minimizes copying (only one copy is made)

# Trace a list while several of its elements are modified in a loop.
# NOTE(review): `x` is whatever the previous snippet left bound. If it is still
# the numeric vector from the tracemem example above, each "median" is simply
# the element itself -- confirm this is the intended input.
y <- as.list(x)
tracemem(y)
#> [1] "<000001AD67FDCD38>"
medians <- vapply(x, median, numeric(1))
# Five elements are assigned, yet the output below reports a single copy.
for (i in 1:5) {
  y[[i]] <- y[[i]] - medians[[i]]
}
#> tracemem[0x000001ad67fdcd38 -> 0x000001ad61982638]:
untracemem(y)

Objects passed to a function follow the same copy-on-modify rules (no copy unless modified)

# Identity function: hands its argument back without touching it.
f <- function(a) a

# f() returns its argument unmodified, so no copy is triggered: the result `z`
# is the very same object as `x`, as the matching addresses show.
x <- c(1, 2, 3)
z <- f(x) # No change in value

obj_addr(x)
#> [1] "0x1624dda26e8"
obj_addr(z) # No address change
#> [1] "0x1624dda26e8"

ref() shows the memory address of a list and its elements

# A list stores references to its elements. After `l2 <- l1`, changing one
# element of `l2` gives `l2` its own list object, but the untouched elements
# are still shared with `l1`.
l1 <- list(1, 2, 3)
obj_addr(l1)
#> [1] "0x1624e315d68"
l2 <- l1
l2[[3]] <- 4
# ref() prints every object with a short id and its address. In the output
# below, ids 2 and 3 appear under both lists (shared elements), while the
# third element of `l2` is a new object (id 6).
ref(l1, l2)
#> █ [1:0x1624e315d68] <list> 
#> ├─[2:0x1624e77ebd0] <dbl> 
#> ├─[3:0x1624e77ea10] <dbl> 
#> └─[4:0x1624e77e850] <dbl> 
#>  
#> █ [5:0x1624e34ed18] <list> 
#> ├─[2:0x1624e77ebd0] 
#> ├─[3:0x1624e77ea10] 
#> └─[6:0x1624e754e70] <dbl>

Since data frames are lists of (column) vectors, modifying a column copies only that column

# Modify one column of a data frame that shares its columns with another.
d1 <- data.frame(x = c(1, 5, 6), y = c(2, 4, 3))
d2 <- d1
d2[, 2] <- d2[, 2] * 2
# In the output, column `x` has the same id/address in both data frames
# (still shared); only column `y` of `d2` is a new object.
ref(d1, d2)
#> █ [1:0x1624ec8d348] <df[,2]> 
#> ├─x = [2:0x1624f230148] <dbl> 
#> └─y = [3:0x1624f2300f8] <dbl> 
#>  
#> █ [4:0x1624ee2aec8] <df[,2]> 
#> ├─x = [2:0x1624f230148] 
#> └─y = [5:0x1624f7aab98] <dbl>

Since data frames are lists of (column) vectors, modifying a row copies every column

# Modify one row of a data frame that shares its columns with another.
d1 <- data.frame(x = c(1, 5, 6), y = c(2, 4, 3))
d2 <- d1
d2[1, ] <- d2[1, ] * 2
# A row touches every column, so in the output both columns of `d2` are new
# objects (ids 5 and 6); nothing is shared with `d1` any more.
ref(d1, d2)
#> █ [1:0x1624f91a508] <df[,2]> 
#> ├─x = [2:0x16250095318] <dbl> 
#> └─y = [3:0x162500952c8] <dbl> 
#>  
#> █ [4:0x1624fb324c8] <df[,2]> 
#> ├─x = [5:0x162501700e8] <dbl> 
#> └─y = [6:0x16250170098] <dbl>

Character vectors reference unique strings stored in the global string pool

# For comparison: two separate `1:4` calls give two distinct integer vectors
# with different addresses.
x <- 1:4
ref(x)
#> [1:0x1624ff60cb0] <int>
y <- 1:4
ref(y)
#> [1:0x162500b4318] <int>
# With character = TRUE, ref() also lists the individual strings of a
# character vector. Both "a" elements of `x` point to one address ...
x <- c("a", "a", "b")
ref(x, character = TRUE)
#> █ [1:0x1625089a188] <chr> 
#> ├─[2:0x16241da3118] <string: "a"> 
#> ├─[2:0x16241da3118] 
#> └─[3:0x1624818b3b8] <string: "b">
# ... and the "a" in a completely separate vector points to that same address.
y <- c("a")
ref(y, character = TRUE)
#> █ [1:0x16250892930] <chr> 
#> └─[2:0x16241da3118] <string: "a">

Memory amount can also be measured, using lobstr::obj_size

# obj_addr() says where an object lives; obj_size() says how much memory it
# occupies.
banana <- "bananas bananas bananas"
obj_addr(banana)
#> [1] "0x1624bb90318"
obj_size(banana)
#> 136 B

Alternative representation (ALTREP) stores sequences such as 1:n compactly, so their size does not depend on their length

# Both integer sequences report the same size, although `y` has 1000 times as
# many elements as `x`.
x <- 1:10
obj_size(x)
#> 680 B
y <- 1:10000
obj_size(y)
#> 680 B

We can measure memory & speed using bench::mark()

# Centre the elements of `d` by subtracting the matching entry of `medians`.
#
# d:       list-like object (list or data frame) indexed with [[ ]].
# medians: values to subtract; entry k is applied to d[[k]].
#
# Only the function's local `d` is changed and nothing is returned explicitly:
# the function's value is that of the loop, i.e. invisible NULL. In this file
# it is used purely as a workload to benchmark.
med <- function(d, medians) {
  for (col in seq_along(medians)) {
    shift <- medians[[col]]
    d[[col]] <- d[[col]] - shift
  }
}
# Benchmark input: 5 * 1e4 uniform random numbers arranged in 5 columns,
# once as a data frame ...
x <- data.frame(matrix(runif(5 * 1e4), ncol = 5))
# ... with one median per column (vapply iterates over the columns) ...
medians <- vapply(x, median, numeric(1))
# ... and once more as a plain list holding the same columns.
y <- as.list(x)

# Run med() on both representations and keep only the timing and allocation
# columns of the result. Row 1 is the data frame, row 2 the list.
bench::mark(
  "data.frame" = med(x, medians),
  "list" = med(y, medians)
)[, c("min", "median", "mem_alloc")]
#> # A tibble: 2 × 3
#>        min   median mem_alloc
#>   <bch:tm> <bch:tm> <bch:byt>
#> 1   52.7µs   71.2µs     491KB
#> 2   16.8µs   35.1µs     391KB

The garbage collector frees objects that are no longer bound to a name; it runs automatically, and gc() forces a run

# Create objects and then remove every name that points to them, leaving
# them unreachable.
x <- 1:3
x <- 2:4 # "1:3" is orphaned
rm(x) # "2:4" is orphaned
# Calling gc() requests a collection and prints a memory usage summary.
gc()
#>           used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells  791094 42.3    1505455 80.4  1505455 80.4
#> Vcells 1497588 11.5    8388608 64.0  8388528 64.0
# Same information condensed into a single total.
lobstr::mem_used() # Wrapper around gc()
#> 56.29 MB