13.15 A Re-Sampling Approach

Re-sampling approach to hypothesis testing using the Khan dataset.

attach(Khan)
x <- rbind(xtrain, xtest)
y <- c(as.numeric(ytrain), as.numeric(ytest))
dim(x)

## [1]   83 2308

table(y)

## y
##  1  2  3  4 
## 11 29 18 25

There are four classes of cancer.

For each gene, we compare the mean expression in the second class (rhabdomyosarcoma) to the mean expression in the fourth class (Burkitt’s lymphoma).

x <- as.matrix(x)
x1 <- x[which(y == 2), ]
x2 <- x[which(y == 4), ]
n1 <- nrow(x1)
n2 <- nrow(x2)
t.out <- t.test(x1[, 11], x2[, 11], var.equal = TRUE)
TT <- t.out$statistic
TT

##         t 
## -2.093633

t.out$p.value

## [1] 0.04118644

Instead of using this theoretical null distribution, we can randomly split the 54 patients into two groups of 29 and 25, and compute a new test statistic.

Repeating this process 10,000 times:

set.seed(1)
B <- 10000
Tbs <- rep(NA, B)
for (b in 1:B) {
   dat <- sample(c(x1[, 11], x2[, 11]))
   Tbs[b] <- t.test(dat[1:n1], dat[(n1 + 1):(n1 + n2)],
        var.equal = TRUE
      )$statistic
}
mean((abs(Tbs) >= abs(TT)))

## [1] 0.0416

This fraction, $0.0416$ , is our re-sampling-based $p$ -value. It is almost identical to the $p$ -value of $0.0412$ obtained using the theoretical null distribution.

A histogram of the re-sampling-based test statistics:

hist(Tbs, breaks = 100, xlim = c(-4.2, 4.2), main = "",
    xlab = "Null Distribution of Test Statistic", col = 7)
lines(seq(-4.2, 4.2, len = 1000),
    dt(seq(-4.2, 4.2, len = 1000),
      df = (n1 + n2 - 2)
    ) * 1000, col = 2, lwd = 3)
abline(v = TT, col = 4, lwd = 2)
text(TT + 0.5, 350, paste("T = ", round(TT, 4), sep = ""),
    col = 4)

For each gene, we first compute the observed test statistic,

m <- 100
B<-1000
set.seed(1)
index <- sample(ncol(x1), m)
Ts <- rep(NA, m)
Ts.star <- matrix(NA, ncol = m, nrow = B)
for (j in 1:m) {
  k <- index[j]
  Ts[j] <- t.test(x1[, k], x2[, k],
        var.equal = TRUE
      )$statistic
  for (b in 1:B) {
    dat <- sample(c(x1[, k], x2[, k]))
    Ts.star[b, j] <- t.test(dat[1:n1],
         dat[(n1 + 1):(n1 + n2)], var.equal = TRUE
       )$statistic
  }
}

Compute a number of rejected null hypotheses $R$ .

cs <- sort(abs(Ts))
FDRs <- Rs <- Vs <- rep(NA, m)
for (j in 1:m) {
  R <- sum(abs(Ts) >= cs[j])
  V <- sum(abs(Ts.star) >= cs[j]) / B
  Rs[j] <- R
  Vs[j] <- V
  FDRs[j] <- V / R
}

The variable index is needed here since we restricted our analysis to just $100$ randomly-selected genes.

max(Rs[FDRs <= .1])

## [1] 15

sort(index[abs(Ts) >= min(cs[FDRs < .1])])

##  [1]   29  465  501  554  573  729  733 1301 1317 1640 1646 1706 1799 1942 2159

max(Rs[FDRs <= .2])

## [1] 28

sort(index[abs(Ts) >= min(cs[FDRs < .2])])

##  [1]   29   40  287  361  369  465  501  554  573  679  729  733  990 1069 1073
## [16] 1301 1317 1414 1639 1640 1646 1706 1799 1826 1942 1974 2087 2159

plot(Rs, FDRs, xlab = "Number of Rejections", type = "l",
    ylab = "False Discovery Rate", col = 4, lwd = 3)