24.9 Case study: t-test
m <- 1000
n <- 50
X <- matrix(rnorm(m * n, mean = 10, sd = 3), nrow = m)
grp <- rep(1:2, each = n / 2)
# formula interface
system.time(
for (i in 1:m) {
t.test(X[i, ] ~ grp)$statistic
}
)
#> user system elapsed
#> 0.401 0.000 0.401
# provide two vectors
system.time(
for (i in 1:m) {
t.test(X[i, grp == 1], X[i, grp == 2])$statistic
}
)
#> user system elapsed
#> 0.116 0.000 0.116
Add functionality to save values
compT <- function(i){
t.test(X[i, grp == 1], X[i, grp == 2])$statistic
}
system.time(t1 <- purrr::map_dbl(1:m, compT))
#> user system elapsed
#> 0.116 0.000 0.117
If you look at the source code of stats:::t.test.default(), you’ll see that it does a lot more than just compute the t-statistic.
# Do less work
my_t <- function(x, grp) {
t_stat <- function(x) {
m <- mean(x)
n <- length(x)
var <- sum((x - m) ^ 2) / (n - 1)
list(m = m, n = n, var = var)
}
g1 <- t_stat(x[grp == 1])
g2 <- t_stat(x[grp == 2])
se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
(g1$m - g2$m) / se_total
}
system.time(t2 <- purrr::map_dbl(1:m, ~ my_t(X[.,], grp)))
#> user system elapsed
#> 0.028 0.000 0.028
stopifnot(all.equal(t1, t2))
This gives us a six-fold speed improvement!
# Vectorise it
rowtstat <- function(X, grp){
t_stat <- function(X) {
m <- rowMeans(X)
n <- ncol(X)
var <- rowSums((X - m) ^ 2) / (n - 1)
list(m = m, n = n, var = var)
}
g1 <- t_stat(X[, grp == 1])
g2 <- t_stat(X[, grp == 2])
se_total <- sqrt(g1$var / g1$n + g2$var / g2$n)
(g1$m - g2$m) / se_total
}
system.time(t3 <- rowtstat(X, grp))
#> user system elapsed
#> 0.012 0.000 0.013
stopifnot(all.equal(t1, t3))
1000 times faster than when we started!