8.4 Differential expression analysis
Limiting factors for detecting real changes between biological conditions:
- number of biological replicates
- non-normality of the distribution of the read counts
- higher uncertainty of measurements for lowly expressed genes than highly expressed genes
DESeq workflow
:
- preparation of data
#remove the 'width' column
countData <- as.matrix(subset(counts, select = c(-width)))
#define the experimental setup
colData <- read.table(coldata_file, header = T, sep = '\t',
stringsAsFactors = TRUE)
#define the design formula
designFormula <- "~ group"
- setting up dds
library(DESeq2)
library(stats)
#create a DESeq dataset object from the count matrix and the colData
dds <- DESeqDataSetFromMatrix(countData = countData,
colData = colData,
design = as.formula(designFormula))
#print dds object to see the contents
print(dds)
## class: DESeqDataSet
## dim: 19719 10
## metadata(1): version
## assays(1): counts
## rownames(19719): TSPAN6 TNMD ... MYOCOS HSFX3
## rowData names(0):
## colnames(10): CASE_1 CASE_2 ... CTRL_4 CTRL_5
## colData names(2): source_name group
#For each gene, we count the total number of reads for that gene in all samples
#and remove those that don't have at least 1 read.
dds <- dds[ rowSums(DESeq2::counts(dds)) > 1, ]
- wrapper function that implements estimation of size factors to normalize the counts, estimation of dispersion values, and computing a GLM model based on the experimental design formula
- extract results
#compute the contrast for the 'group' variable where 'CTRL'
#samples are used as the control group.
DEresults = results(dds, contrast = c("group", 'CASE', 'CTRL'))
#sort results by increasing p-value
DEresults <- DEresults[order(DEresults$pvalue),]
## log2 fold change (MLE): group CASE vs CTRL
## Wald test p-value: group CASE vs CTRL
## DataFrame with 19097 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## CYP2E1 4829889 9.36024 0.215223 43.4909 0.00000e+00
## FCGBP 10349993 -7.57579 0.186433 -40.6355 0.00000e+00
## ASGR2 426422 8.01830 0.216207 37.0863 4.67898e-301
## GCKR 100183 7.82841 0.233376 33.5442 1.09479e-246
## APOA5 438054 10.20248 0.312503 32.6477 8.64906e-234
## ... ... ... ... ... ...
## CCDC195 20.4981 -0.215607 2.89255 -0.0745386 NA
## SPEM3 23.6370 -22.154765 3.02785 -7.3170030 NA
## AC022167.5 21.8451 -2.056240 2.89545 -0.7101618 NA
## BX276092.9 29.9636 0.407326 2.89048 0.1409199 NA
## ETDC 22.5675 -1.795274 2.89421 -0.6202983 NA
## padj
## <numeric>
## CYP2E1 0.00000e+00
## FCGBP 0.00000e+00
## ASGR2 2.87741e-297
## GCKR 5.04945e-243
## APOA5 3.19133e-230
## ... ...
## CCDC195 NA
## SPEM3 NA
## AC022167.5 NA
## BX276092.9 NA
## ETDC NA