bench::mark to compare the different versions (and include unit tests).
rowSums(), colSums(), rowMeans(), and colMeans() are faster than equivalent invocations that use apply() because they are vectorised.
vapply() is faster than sapply() because it pre-specifies the output type.
any(x == 10) is much faster than 10 %in% x because testing equality is simpler than testing set inclusion.
apply() will always turn a data frame into a matrix.
read.csv(): specify known column types with colClasses. (Also consider switching to readr::read_csv() or data.table::fread(), which are considerably faster than read.csv().)
factor(): specify known levels with levels.
cut(): don’t generate labels with labels = FALSE if you don’t need them, or, even better, use findInterval() as mentioned in the “see also” section of the documentation.
unlist(x, use.names = FALSE) is much faster than unlist(x).
interaction(): if you only need combinations that exist in the data, use drop = TRUE.
# Compare calling the generic mean() with calling mean.default() directly
# on a short vector (100 uniform draws). The trailing `[...]` keeps only a
# subset of the columns of the bench::mark() result.
x <- runif(1e2)
bench::mark(
mean(x),
mean.default(x)
)[c("expression", "min", "median", "itr/sec", "n_gc")]
#> # A tibble: 2 × 4
#> expression min median `itr/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl>
#> 1 mean(x) 2.8µs 3.1µs 295383.
#> 2 mean.default(x) 900ns 1.1µs 839532.
# Same comparison on a 100-element vector, with a third candidate:
# .Internal(mean(x)), which invokes the internal implementation without
# going through the R-level mean() / mean.default() functions.
x <- runif(1e2)
bench::mark(
mean(x),
mean.default(x),
.Internal(mean(x))
)[c("expression", "min", "median", "itr/sec", "n_gc")]
#> # A tibble: 3 × 4
#> expression min median `itr/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl>
#> 1 mean(x) 2.7µs 2.9µs 310733.
#> 2 mean.default(x) 900ns 1.1µs 849192.
#> 3 .Internal(mean(x)) 100ns 200ns 4495392.
# Repeat the three-way comparison on a longer vector (10,000 uniform
# draws) to see how the differences change as the input grows.
x <- runif(1e4)
bench::mark(
mean(x),
mean.default(x),
.Internal(mean(x))
)[c("expression", "min", "median", "itr/sec", "n_gc")]
#> # A tibble: 3 × 4
#> expression min median `itr/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl>
#> 1 mean(x) 16.6µs 17.3µs 55604.
#> 2 mean.default(x) 14.4µs 14.9µs 64958.
#> 3 .Internal(mean(x)) 13.6µs 13.7µs 70368.
as.data.frame() is quite slow because it coerces each element into a data frame and then rbind()s them together.
quickdf <- function(l) {
class(l) <- "data.frame"
attr(l, "row.names") <- .set_row_names(length(l[[1]]))
l
}
# Build a named list of 26 numeric vectors (1,000 uniform draws each, named
# a-z), then time the general-purpose as.data.frame() against quickdf().
l <- setNames(replicate(26, runif(1e3), simplify = FALSE), letters)
bench::mark(
  as.data.frame = as.data.frame(l),
  quick_df = quickdf(l)
)[c("expression", "min", "median", "itr/sec", "n_gc")]
#> # A tibble: 2 × 4
#> expression min median `itr/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl>
#> 1 as.data.frame 589.5µs 656.2µs 1447.
#> 2 quick_df 3.7µs 4.3µs 207929.
Caveat! This method is fast because it’s dangerous!
rowSums(), colSums(), rowMeans(), and colMeans()
cut() and findInterval() for converting continuous variables to categorical
cumsum() and diff()
# Generate one random string of lower-case letters.
#
# n: number of characters to draw (default 50, the length used by the
#    benchmark fixtures). n = 0 yields the empty string "".
# Returns a character vector of length one. Letters are sampled with
# replacement, so the result depends on the current RNG state.
random_string <- function(n = 50) {
  paste(sample(letters, n, replace = TRUE), collapse = "")
}
# Benchmark inputs: character vectors holding 10 and 100 random strings.
strings10 <- vapply(seq_len(10), function(i) random_string(), character(1))
strings100 <- vapply(seq_len(100), function(i) random_string(), character(1))
# Join the elements of `xs` into one string by appending them one at a
# time. Starts from "" so an empty `xs` returns "". The accumulator is
# re-created on every iteration; this is the loop-based variant that the
# benchmark compares with paste(xs, collapse = "").
collapse <- function(xs) {
  joined <- ""
  for (piece in xs) {
    joined <- paste0(joined, piece)
  }
  joined
}
# Time the loop-based collapse() against vectorised paste(collapse = "")
# on both the 10-string and the 100-string input. check = FALSE turns off
# bench::mark()'s comparison of results, which would otherwise fail
# because the 10- and 100-string expressions return different values.
bench::mark(
loop10 = collapse(strings10),
loop100 = collapse(strings100),
vec10 = paste(strings10, collapse = ""),
vec100 = paste(strings100, collapse = ""),
check = FALSE
)[c("expression", "min", "median", "itr/sec", "n_gc")]
#> # A tibble: 4 × 4
#> expression min median `itr/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl>
#> 1 loop10 17.1µs 18µs 52743.
#> 2 loop100 460.3µs 491.5µs 1959.
#> 3 vec10 2.7µs 2.9µs 317912.
#> 4 vec100 16.3µs 18.5µs 51297.
#> user system elapsed
#> 0.28 0.00 0.33
# provide two vectors
# Time a t-test for each of rows 1..m of X, passing the two groups
# (columns where grp == 1 and grp == 2) as separate vectors. The statistic
# is computed and then discarded. seq_len(m) is used instead of 1:m so the
# loop body is skipped, rather than run for i = 1 and i = 0, when m is 0.
system.time(
  for (i in seq_len(m)) {
    t.test(X[i, grp == 1], X[i, grp == 2])$statistic
  }
)
#> user system elapsed
#> 0.08 0.00 0.08
Add functionality to save values
# t-statistic for row `i` of X, comparing the columns where grp == 1 with
# those where grp == 2. X and grp are read from the enclosing environment.
# Returns the `statistic` element of the t.test() result.
compT <- function(i) {
  in_first <- grp == 1
  in_second <- grp == 2
  fit <- t.test(X[i, in_first], X[i, in_second])
  fit$statistic
}
# Time the per-row t-tests while collecting the statistics into the numeric
# vector t1. seq_len(m) (rather than 1:m) yields no iterations when m is 0.
system.time(t1 <- purrr::map_dbl(seq_len(m), compT))
#> user system elapsed
#> 0.09 0.00 0.09
If you look at the source code of stats:::t.test.default(), you’ll see that it does a lot more than just compute the t-statistic.
# Do less work
# Compute only the two-sample t-statistic (unequal-variance form) for the
# observations in `x`, split into two groups by `grp` (values 1 and 2).
# Returns a single unnamed number: difference of group means divided by
# the combined standard error.
my_t <- function(x, grp) {
  # Mean, size and sample variance of one group.
  summarise_group <- function(values) {
    centre <- mean(values)
    size <- length(values)
    spread <- sum((values - centre) ^ 2) / (size - 1)
    list(m = centre, n = size, var = spread)
  }

  first <- summarise_group(x[grp == 1])
  second <- summarise_group(x[grp == 2])

  std_err <- sqrt(first$var / first$n + second$var / second$n)
  (first$m - second$m) / std_err
}
# Time my_t() applied to each of rows 1..m of X, storing the statistics in
# t2. seq_len(m) (rather than 1:m) yields no iterations when m is 0.
system.time(t2 <- purrr::map_dbl(seq_len(m), function(i) my_t(X[i, ], grp)))
#> user system elapsed
#> 0.02 0.00 0.01
This gives us a six-fold speed improvement!
# Vectorise it
# Compute the two-sample t-statistic (unequal-variance form) for every row
# of `X` at once.
#
# X:   matrix with one row per test and one column per observation.
# grp: vector with one entry per column of X; columns with grp == 1 form
#      the first group, columns with grp == 2 the second.
# Returns a numeric vector with one statistic per row of X.
rowtstat <- function(X, grp) {
  # Row-wise mean, group size and sample variance for one group's columns.
  t_stat <- function(X) {
    m <- rowMeans(X)
    n <- ncol(X)
    # `m` has one entry per row and is recycled down each column, so every
    # element has its own row mean subtracted.
    var <- rowSums((X - m) ^ 2) / (n - 1)
    list(m = m, n = n, var = var)
  }

  # drop = FALSE keeps the subsets as matrices even when X has a single
  # row or a group has a single column; otherwise the subset collapses to
  # a plain vector and rowMeans() fails.
  g1 <- t_stat(X[, grp == 1, drop = FALSE])
  g2 <- t_stat(X[, grp == 2, drop = FALSE])

  se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
  (g1$m - g2$m) / se_total
}
# Time the vectorised version: one call handles all of X, and the
# resulting statistics are stored in t3.
system.time(t3 <- rowtstat(X, grp))
#> user system elapsed
#> 0.02 0.00 0.02
1000 times faster than when we started!
Read R blogs to see what performance problems other people have struggled with, and how they have made their code faster.
Read other R programming books, like The Art of R Programming or Patrick Burns’ R Inferno to learn about common traps.
Take an algorithms and data structure course to learn some well known ways of tackling certain classes of problems. I have heard good things about Princeton’s Algorithms course offered on Coursera.
Learn how to parallelise your code. Two places to start are Parallel R and Parallel Computing for Data Science.
Read general books about optimisation, like Mature Optimisation or The Pragmatic Programmer.
Read more R code. StackOverflow, R Mailing List, DSLC, GitHub, etc.