## ----knitr, echo=FALSE, results="hide"----------------------------------- library("knitr") opts_chunk$set( tidy=FALSE, dev="png", fig.show="hide", fig.width=4, fig.height=4.5, cache=TRUE, message=FALSE) ## ----style, eval=TRUE, echo=FALSE, results="asis"------------------------ BiocStyle::latex() ## ----loadDESeq2, echo=FALSE---------------------------------------------- library("DESeq2") ## ----options, results="hide", echo=FALSE--------------------------------- options(digits=3, width=80, prompt=" ", continue=" ") ## ----quick, eval=FALSE--------------------------------------------------- ## dds <- DESeqDataSet(se, design = ~ batch + condition) ## dds <- DESeq(dds) ## res <- results(dds, contrast=c("condition","trt","con")) ## ----loadSumExp---------------------------------------------------------- library("airway") data("airway") se <- airway ## ----sumExpInput--------------------------------------------------------- library("DESeq2") ddsSE <- DESeqDataSet(se, design = ~ cell + dex) ddsSE ## ----loadPasilla--------------------------------------------------------- library("pasilla") library("Biobase") data("pasillaGenes") countData <- counts(pasillaGenes) colData <- pData(pasillaGenes)[,c("condition","type")] ## ----showPasilla--------------------------------------------------------- head(countData) head(colData) ## ----matrixInput--------------------------------------------------------- dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ condition) dds ## ----addFeatureData------------------------------------------------------ featureData <- data.frame(gene=rownames(pasillaGenes)) (mcols(dds) <- DataFrame(mcols(dds), featureData)) ## ----tximport------------------------------------------------------------ library("tximport") library("readr") library("tximportData") dir <- system.file("extdata", package="tximportData") samples <- read.table(file.path(dir,"samples.txt"), header=TRUE) files <- file.path(dir,"salmon", samples$run, "quant.sf") names(files) <- paste0("sample",1:6) tx2gene <- read.csv(file.path(dir, "tx2gene.csv")) txi <- tximport(files, type="salmon", tx2gene=tx2gene, reader=read_tsv) ## ----txi2dds------------------------------------------------------------- coldata <- data.frame(condition=factor(rep(c("A","B"),each=3))) rownames(coldata) <- colnames(txi$counts) ddsTxi <- DESeqDataSetFromTximport(txi, colData=coldata, design=~ condition) ## ----htseqDirI, eval=FALSE----------------------------------------------- ## directory <- "/path/to/your/files/" ## ----htseqDirII---------------------------------------------------------- directory <- system.file("extdata", package="pasilla", mustWork=TRUE) ## ----htseqInput---------------------------------------------------------- sampleFiles <- grep("treated",list.files(directory),value=TRUE) sampleCondition <- sub("(.*treated).*","\\1",sampleFiles) sampleTable <- data.frame(sampleName = sampleFiles, fileName = sampleFiles, condition = sampleCondition) ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory, design= ~ condition) ddsHTSeq ## ----prefilter----------------------------------------------------------- dds <- dds[ rowSums(counts(dds)) > 1, ] ## ----factorlvl----------------------------------------------------------- dds$condition <- factor(dds$condition, levels=c("untreated","treated")) ## ----relevel------------------------------------------------------------- dds$condition <- relevel(dds$condition, ref="untreated") ## ----droplevels---------------------------------------------------------- dds$condition <- droplevels(dds$condition) ## ----deseq--------------------------------------------------------------- dds <- DESeq(dds) res <- results(dds) res ## ----parallel, eval=FALSE------------------------------------------------ ## library("BiocParallel") ## register(MulticoreParam(4)) ## ----resOrder------------------------------------------------------------ resOrdered <- res[order(res$padj),] ## ----sumRes-------------------------------------------------------------- summary(res) ## ----sumRes01------------------------------------------------------------ sum(res$padj < 0.1, na.rm=TRUE) ## ----resAlpha05---------------------------------------------------------- res05 <- results(dds, alpha=0.05) summary(res05) sum(res05$padj < 0.05, na.rm=TRUE) ## ----IHW----------------------------------------------------------------- library("IHW") resIHW <- results(dds, filterFun=ihw) summary(resIHW) sum(resIHW$padj < 0.1, na.rm=TRUE) metadata(resIHW)$ihwResult ## ----MA, fig.width=4.5, fig.height=4.5----------------------------------- plotMA(res, main="DESeq2", ylim=c(-2,2)) ## ----MAidentify, eval=FALSE---------------------------------------------- ## idx <- identify(res$baseMean, res$log2FoldChange) ## rownames(res)[idx] ## ----resMLE-------------------------------------------------------------- resMLE <- results(dds, addMLE=TRUE) head(resMLE, 4) ## ----MANoPrior, fig.width=4.5, fig.height=4.5---------------------------- plotMA(resMLE, MLE=TRUE, main="unshrunken LFC", ylim=c(-2,2)) ## ----plotCounts, dev="pdf", fig.width=4.5, fig.height=5------------------ plotCounts(dds, gene=which.min(res$padj), intgroup="condition") ## ----plotCountsAdv, dev="pdf", fig.width=3.5, fig.height=3.5------------- d <- plotCounts(dds, gene=which.min(res$padj), intgroup="condition", returnData=TRUE) library("ggplot2") ggplot(d, aes(x=condition, y=count)) + geom_point(position=position_jitter(w=0.1,h=0)) + scale_y_log10(breaks=c(25,100,400)) ## ----metadata------------------------------------------------------------ mcols(res)$description ## ----export, eval=FALSE-------------------------------------------------- ## write.csv(as.data.frame(resOrdered), ## file="condition_treated_results.csv") ## ----subset-------------------------------------------------------------- resSig <- subset(resOrdered, padj < 0.1) resSig ## ----multifactor--------------------------------------------------------- colData(dds) ## ----copyMultifactor----------------------------------------------------- ddsMF <- dds ## ----replaceDesign------------------------------------------------------- design(ddsMF) <- formula(~ type + condition) ddsMF <- DESeq(ddsMF) ## ----multiResults-------------------------------------------------------- resMF <- results(ddsMF) head(resMF) ## ----multiTypeResults---------------------------------------------------- resMFType <- results(ddsMF, contrast=c("type","single-read","paired-end")) head(resMFType) ## ----rlogAndVST---------------------------------------------------------- rld <- rlog(dds, blind=FALSE) vsd <- varianceStabilizingTransformation(dds, blind=FALSE) vsd.fast <- vst(dds, blind=FALSE) head(assay(rld), 3) ## ----vsd1, echo=FALSE, fig.width=4.5, fig.height=4.5--------------------- px <- counts(dds)[,1] / sizeFactors(dds)[1] ord <- order(px) ord <- ord[px[ord] < 150] ord <- ord[seq(1, length(ord), length=50)] last <- ord[length(ord)] vstcol <- c("blue", "black") matplot(px[ord], cbind(assay(vsd)[, 1], log2(px))[ord, ], type="l", lty=1, col=vstcol, xlab="n", ylab="f(n)") legend("bottomright", legend = c( expression("variance stabilizing transformation"), expression(log[2](n/s[1]))), fill=vstcol) ## ----log2meansd, fig.width=4, fig.height=3------------------------------- library("vsn") notAllZero <- (rowSums(counts(dds))>0) meanSdPlot(log2(counts(dds,normalized=TRUE)[notAllZero,] + 1)) ## ----rlogmeansd, fig.width=4, fig.height=3------------------------------- meanSdPlot(assay(rld[notAllZero,])) ## ----vstmeansd, fig.width=4, fig.height=3-------------------------------- meanSdPlot(assay(vsd[notAllZero,])) ## ----heatmap------------------------------------------------------------- library("pheatmap") select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20] ## ----figHeatmap2a, dev="pdf", fig.width=5, fig.height=7------------------ nt <- normTransform(dds) # defaults to log2(x+1) log2.norm.counts <- assay(nt)[select,] df <- as.data.frame(colData(dds)[,c("condition","type")]) pheatmap(log2.norm.counts, cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) ## ----figHeatmap2b, dev="pdf", fig.width=5, fig.height=7------------------ pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) ## ----figHeatmap2c, dev="pdf", fig.width=5, fig.height=7------------------ pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) ## ----sampleClust--------------------------------------------------------- sampleDists <- dist(t(assay(rld))) ## ----figHeatmapSamples, dev="pdf", fig.width=7, fig.height=7------------- library("RColorBrewer") sampleDistMatrix <- as.matrix(sampleDists) rownames(sampleDistMatrix) <- paste(rld$condition, rld$type, sep="-") colnames(sampleDistMatrix) <- NULL colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) pheatmap(sampleDistMatrix, clustering_distance_rows=sampleDists, clustering_distance_cols=sampleDists, col=colors) ## ----figPCA, dev="pdf", fig.width=5, fig.height=3------------------------ plotPCA(rld, intgroup=c("condition", "type")) ## ----figPCA2, dev="pdf", fig.width=5, fig.height=3----------------------- data <- plotPCA(rld, intgroup=c("condition", "type"), returnData=TRUE) percentVar <- round(100 * attr(data, "percentVar")) ggplot(data, aes(PC1, PC2, color=condition, shape=type)) + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) ## ----WaldTest, eval=FALSE------------------------------------------------ ## dds <- estimateSizeFactors(dds) ## dds <- estimateDispersions(dds) ## dds <- nbinomWaldTest(dds) ## ----simpleContrast, eval=FALSE------------------------------------------ ## results(dds, contrast=c("condition","C","B")) ## ----combineFactors, eval=FALSE------------------------------------------ ## dds$group <- factor(paste0(dds$genotype, dds$condition)) ## design(dds) <- ~ group ## dds <- DESeq(dds) ## resultsNames(dds) ## results(dds, contrast=c("group", "IB", "IA")) ## ----interFig, dev="pdf", fig.width=4, fig.height=3, echo=FALSE, results="hide"---- npg <- 20 mu <- 2^c(8,10,9,11,10,12) cond <- rep(rep(c("A","B"),each=npg),3) geno <- rep(c("I","II","III"),each=2*npg) table(cond, geno) counts <- rnbinom(6*npg, mu=rep(mu,each=npg), size=1/.01) d <- data.frame(log2c=log2(counts+1), cond, geno) library(ggplot2) plotit <- function(d, title) { ggplot(d, aes(x=cond, y=log2c, group=geno)) + geom_jitter(size=1.5, position = position_jitter(width=.15)) + facet_wrap(~ geno) + stat_summary(fun.y=mean, geom="line", colour="red", size=0.8) + xlab("condition") + ylab("log2(counts+1)") + ggtitle(title) } plotit(d, "Gene 1") + ylim(7,13) lm(log2c ~ cond + geno + geno:cond, data=d) ## ----interFig2, dev="pdf", fig.width=4, fig.height=3, echo=FALSE, results="hide"---- mu[4] <- 2^12 mu[6] <- 2^8 counts <- rnbinom(6*npg, mu=rep(mu,each=npg), size=1/.01) d2 <- data.frame(log2c=log2(counts + 1), cond, geno) plotit(d2, "Gene 2") + ylim(7,13) lm(log2c ~ cond + geno + geno:cond, data=d2) ## ----boxplotCooks-------------------------------------------------------- par(mar=c(8,5,2,2)) boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) ## ----dispFit------------------------------------------------------------- plotDispEsts(dds) ## ----dispFitCustom------------------------------------------------------- ddsCustom <- dds useForMedian <- mcols(ddsCustom)$dispGeneEst > 1e-7 medianDisp <- median(mcols(ddsCustom)$dispGeneEst[useForMedian],na.rm=TRUE) dispersionFunction(ddsCustom) <- function(mu) medianDisp ddsCustom <- estimateDispersionsMAP(ddsCustom) ## ----filtByMean, dev="pdf"----------------------------------------------- metadata(res)$alpha metadata(res)$filterThreshold plot(metadata(res)$filterNumRej, type="b", ylab="number of rejections", xlab="quantiles of filter") lines(metadata(res)$lo.fit, col="red") abline(v=metadata(res)$filterTheta) ## ----noFilt-------------------------------------------------------------- resNoFilt <- results(dds, independentFiltering=FALSE) addmargins(table(filtering=(res$padj < .1), noFiltering=(resNoFilt$padj < .1))) ## ----ddsNoPrior---------------------------------------------------------- ddsNoPrior <- DESeq(dds, betaPrior=FALSE) ## ----lfcThresh----------------------------------------------------------- par(mfrow=c(2,2),mar=c(2,2,1,1)) yl <- c(-2.5,2.5) resGA <- results(dds, lfcThreshold=.5, altHypothesis="greaterAbs") resLA <- results(ddsNoPrior, lfcThreshold=.5, altHypothesis="lessAbs") resG <- results(dds, lfcThreshold=.5, altHypothesis="greater") resL <- results(dds, lfcThreshold=.5, altHypothesis="less") plotMA(resGA, ylim=yl) abline(h=c(-.5,.5),col="dodgerblue",lwd=2) plotMA(resLA, ylim=yl) abline(h=c(-.5,.5),col="dodgerblue",lwd=2) plotMA(resG, ylim=yl) abline(h=.5,col="dodgerblue",lwd=2) plotMA(resL, ylim=yl) abline(h=-.5,col="dodgerblue",lwd=2) ## ----mcols--------------------------------------------------------------- mcols(dds,use.names=TRUE)[1:4,1:4] # here using substr() only for display purposes substr(names(mcols(dds)),1,10) mcols(mcols(dds), use.names=TRUE)[1:4,] ## ----muAndCooks---------------------------------------------------------- head(assays(dds)[["mu"]]) head(assays(dds)[["cooks"]]) ## ----dispersions--------------------------------------------------------- head(dispersions(dds)) # which is the same as head(mcols(dds)$dispersion) ## ----sizefactors--------------------------------------------------------- sizeFactors(dds) ## ----coef---------------------------------------------------------------- head(coef(dds)) ## ----betaPriorVar-------------------------------------------------------- attr(dds, "betaPriorVar") ## ----dispPriorVar-------------------------------------------------------- dispersionFunction(dds) attr(dispersionFunction(dds), "dispPriorVar") ## ----versionNum---------------------------------------------------------- metadata(dds)[["version"]] ## ----normFactors, eval=FALSE--------------------------------------------- ## normFactors <- normFactors / exp(rowMeans(log(normFactors))) ## normalizationFactors(dds) <- normFactors ## ----offsetTransform, eval=FALSE----------------------------------------- ## cqnOffset <- cqnObject$glm.offset ## cqnNormFactors <- exp(cqnOffset) ## EDASeqNormFactors <- exp(-1 * EDASeqOffset) ## ----lineardep, echo=FALSE----------------------------------------------- data.frame(batch=factor(c(1,1,2,2)), condition=factor(c("A","A","B","B"))) ## ----lineardep2, echo=FALSE---------------------------------------------- data.frame(batch=factor(c(1,1,1,1,2,2)), condition=factor(c("A","A","B","B","C","C"))) ## ----lineardep3, echo=FALSE---------------------------------------------- data.frame(batch=factor(c(1,1,1,2,2,2)), condition=factor(c("A","B","C","A","B","C"))) ## ----groupeffect--------------------------------------------------------- (coldata <- data.frame(grp=factor(rep(c("X","Y"),each=4)), ind=factor(rep(1:4,each=2)), cnd=factor(rep(c("A","B"),4)))) ## ----groupeffect2-------------------------------------------------------- coldata$ind.n <- factor(rep(rep(1:2,each=2),2)) coldata ## ----groupeffect3-------------------------------------------------------- model.matrix(~ grp + grp:ind.n + grp:cnd, coldata) ## ----groupeffect4, eval=FALSE-------------------------------------------- ## results(dds, contrast=list("grpY.cndB","grpX.cndB")) ## ----missingcombo-------------------------------------------------------- group <- factor(rep(1:3,each=6)) condition <- factor(rep(rep(c("A","B","C"),each=2),3)) (d <- data.frame(group, condition)[-c(17,18),]) ## ----missingcombo2------------------------------------------------------- m1 <- model.matrix(~ condition*group, d) colnames(m1) unname(m1) ## ----missingcombo3------------------------------------------------------- m1 <- m1[,-9] unname(m1) ## ----cooksPlot----------------------------------------------------------- W <- res$stat maxCooks <- apply(assays(dds)[["cooks"]],1,max) idx <- !is.na(W) plot(rank(W[idx]), maxCooks[idx], xlab="rank of Wald statistic", ylab="maximum Cook's distance per gene", ylim=c(0,5), cex=.4, col=rgb(0,0,0,.3)) m <- ncol(dds) p <- 3 abline(h=qf(.99, p, m - p)) ## ----indFilt------------------------------------------------------------- plot(res$baseMean+1, -log10(res$pvalue), log="x", xlab="mean of normalized counts", ylab=expression(-log[10](pvalue)), ylim=c(0,30), cex=.4, col=rgb(0,0,0,.3)) ## ----histindepfilt, dev="pdf", fig.width=7, fig.height=5----------------- use <- res$baseMean > metadata(res)$filterThreshold h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE) h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE) colori <- c(`do not pass`="khaki", `pass`="powderblue") ## ----fighistindepfilt---------------------------------------------------- barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "", ylab="frequency") text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA) legend("topright", fill=rev(colori), legend=rev(names(colori))) ## ----vanillaDESeq, eval=FALSE-------------------------------------------- ## dds <- DESeq(dds, minReplicatesForReplace=Inf) ## res <- results(dds, cooksCutoff=FALSE, independentFiltering=FALSE) ## ----varGroup, echo=FALSE, fig.width=5, fig.height=5--------------------- set.seed(3) dds1 <- makeExampleDESeqDataSet(n=1000,m=12,betaSD=.3,dispMeanRel=function(x) 0.01) dds2 <- makeExampleDESeqDataSet(n=1000,m=12, betaSD=.3, interceptMean=mcols(dds1)$trueIntercept, interceptSD=0, dispMeanRel=function(x) 0.2) dds2 <- dds2[,7:12] dds2$condition <- rep("C",6) mcols(dds2) <- NULL dds <- cbind(dds1, dds2) rld <- rlog(dds, blind=FALSE, fitType="mean") plotPCA(rld) ## ----overShrink, echo=FALSE, fig.width=5, fig.height=5------------------- plot(c(10^rnorm(1000, 3, 2),300,2000,5000), c(rnorm(1000, 0, .15), -5.5, -8.5, 7.5), ylim=c(-10,10), log="x", cex=.4, xlab="mean of normalized counts", ylab="log2 fold change") abline(h=0, col=rgb(1,0,0,.7)) ## ----convertNA, eval=FALSE----------------------------------------------- ## res$padj <- ifelse(is.na(res$padj), 1, res$padj) ## ----sessInfo, results="asis", echo=FALSE-------------------------------- toLatex(sessionInfo()) ## ----resetOptions, results="hide", echo=FALSE---------------------------- options(prompt="> ", continue="+ ")