5.10 Prediciting continuous variables by regression

# file path for CpG methylation and age
fileMethAge=system.file("extdata",
                      "CpGmeth2Age.rds",
                      package="compGenomRData")

# read methylation-age table
ameth=readRDS(fileMethAge)
dim(ameth)
## [1]   108 27579
summary(ameth[,1:3])
##       Age            cg26211698        cg03790787     
##  Min.   :-0.4986   Min.   :0.01223   Min.   :0.05001  
##  1st Qu.:-0.4027   1st Qu.:0.01885   1st Qu.:0.07818  
##  Median :18.8466   Median :0.02269   Median :0.08964  
##  Mean   :25.9083   Mean   :0.02483   Mean   :0.09300  
##  3rd Qu.:49.6110   3rd Qu.:0.02888   3rd Qu.:0.10423  
##  Max.   :83.6411   Max.   :0.04883   Max.   :0.16271
# plot histogram of methylation values
hist(unlist(ameth[,-1]),border="white",
      col="cornflowerblue",main="",xlab="methylation values")

# remove CpGs with less than 0.1 StDev

ameth=ameth[,c(TRUE,matrixStats::colSds(as.matrix(ameth[,-1]))>0.1)]
dim(ameth)
## [1]  108 2290

Running random forest regression

set.seed(18)

par(mfrow=c(1,2))

# we are not going to do any cross-validation
# and rely on OOB error
trctrl <- trainControl(method = "none")

# we will now train random forest model
rfregFit <- train(Age~., 
               data = ameth, 
               method = "ranger",
               trControl=trctrl,
               # calculate importance
               importance="permutation", 
               tuneGrid = data.frame(mtry=50,
                                     min.node.size = 5,
                                     splitrule="variance")
               )
# plot Observed vs OOB predicted values from the model
plot(ameth$Age,rfregFit$finalModel$predictions,
     pch=19,xlab="observed Age",
     ylab="OOB predicted Age")
mtext(paste("R-squared",
            format(rfregFit$finalModel$r.squared,digits=2)))

# plot residuals
plot(ameth$Age,(rfregFit$finalModel$predictions-ameth$Age),
     pch=18,ylab="residuals (predicted-observed)",
     xlab="observed Age",col="blue3")
abline(h=0,col="red4",lty=2)