13.15 A Re-Sampling Approach
Re-sampling approach to hypothesis testing using the Khan
dataset.
attach(Khan)
<- rbind(xtrain, xtest)
x <- c(as.numeric(ytrain), as.numeric(ytest))
y dim(x)
## [1] 83 2308
table(y)
## y
## 1 2 3 4
## 11 29 18 25
There are four classes of cancer.
For each gene, we compare the mean expression in the second class (rhabdomyosarcoma) to the mean expression in the fourth class (Burkitt’s lymphoma).
<- as.matrix(x)
x <- x[which(y == 2), ]
x1 <- x[which(y == 4), ]
x2 <- nrow(x1)
n1 <- nrow(x2)
n2 <- t.test(x1[, 11], x2[, 11], var.equal = TRUE)
t.out <- t.out$statistic
TT TT
## t
## -2.093633
$p.value t.out
## [1] 0.04118644
Instead of using this theoretical null distribution, we can randomly split the 54 patients into two groups of 29 and 25, and compute a new test statistic.
Repeating this process 10,000 times:
set.seed(1)
<- 10000
B <- rep(NA, B)
Tbs for (b in 1:B) {
<- sample(c(x1[, 11], x2[, 11]))
dat <- t.test(dat[1:n1], dat[(n1 + 1):(n1 + n2)],
Tbs[b] var.equal = TRUE
$statistic
)
}mean((abs(Tbs) >= abs(TT)))
## [1] 0.0416
This fraction, \(0.0416\), is our re-sampling-based \(p\)-value. It is almost identical to the \(p\)-value of \(0.0412\) obtained using the theoretical null distribution.
A histogram of the re-sampling-based test statistics:
hist(Tbs, breaks = 100, xlim = c(-4.2, 4.2), main = "",
xlab = "Null Distribution of Test Statistic", col = 7)
lines(seq(-4.2, 4.2, len = 1000),
dt(seq(-4.2, 4.2, len = 1000),
df = (n1 + n2 - 2)
* 1000, col = 2, lwd = 3)
) abline(v = TT, col = 4, lwd = 2)
text(TT + 0.5, 350, paste("T = ", round(TT, 4), sep = ""),
col = 4)
For each gene, we first compute the observed test statistic,
<- 100
m <-1000
Bset.seed(1)
<- sample(ncol(x1), m)
index <- rep(NA, m)
Ts <- matrix(NA, ncol = m, nrow = B)
Ts.star for (j in 1:m) {
<- index[j]
k <- t.test(x1[, k], x2[, k],
Ts[j] var.equal = TRUE
$statistic
)for (b in 1:B) {
<- sample(c(x1[, k], x2[, k]))
dat <- t.test(dat[1:n1],
Ts.star[b, j] + 1):(n1 + n2)], var.equal = TRUE
dat[(n1 $statistic
)
} }
Compute a number of rejected null hypotheses \(R\).
<- sort(abs(Ts))
cs <- Rs <- Vs <- rep(NA, m)
FDRs for (j in 1:m) {
<- sum(abs(Ts) >= cs[j])
R <- sum(abs(Ts.star) >= cs[j]) / B
V <- R
Rs[j] <- V
Vs[j] <- V / R
FDRs[j] }
The variable index
is needed here since we restricted our analysis to just \(100\) randomly-selected genes.
max(Rs[FDRs <= .1])
## [1] 15
sort(index[abs(Ts) >= min(cs[FDRs < .1])])
## [1] 29 465 501 554 573 729 733 1301 1317 1640 1646 1706 1799 1942 2159
max(Rs[FDRs <= .2])
## [1] 28
sort(index[abs(Ts) >= min(cs[FDRs < .2])])
## [1] 29 40 287 361 369 465 501 554 573 679 729 733 990 1069 1073
## [16] 1301 1317 1414 1639 1640 1646 1706 1799 1826 1942 1974 2087 2159
plot(Rs, FDRs, xlab = "Number of Rejections", type = "l",
ylab = "False Discovery Rate", col = 4, lwd = 3)