## ----setup, include=FALSE----------------------------------------------------- options(width=80) knitr::opts_chunk$set(collapse=TRUE, message=FALSE, comment="") ## ----library_install, message=FALSE, cache=FALSE, eval=FALSE------------------ # install.packages("BiocManager") # BiocManager::install("GSVA") ## ----load_library, message=FALSE, warning=FALSE, cache=FALSE------------------ library(GSVA) ## ----------------------------------------------------------------------------- p <- 10000 ## number of genes n <- 30 ## number of samples ## simulate expression values from a standard Gaussian distribution X <- matrix(rnorm(p*n), nrow=p, dimnames=list(paste0("g", 1:p), paste0("s", 1:n))) X[1:5, 1:5] ## ----------------------------------------------------------------------------- ## sample gene set sizes gs <- as.list(sample(10:100, size=100, replace=TRUE)) ## sample gene sets gs <- lapply(gs, function(n, p) paste0("g", sample(1:p, size=n, replace=FALSE)), p) names(gs) <- paste0("gs", 1:length(gs)) ## ----------------------------------------------------------------------------- gsvaPar <- gsvaParam(X, gs) gsvaPar ## ----------------------------------------------------------------------------- gsva.es <- gsva(gsvaPar, verbose=FALSE) dim(gsva.es) gsva.es[1:5, 1:5] ## ----------------------------------------------------------------------------- class(gs) length(gs) head(lapply(gs, head)) ## ----message=FALSE------------------------------------------------------------ library(org.Hs.eg.db) goannot <- select(org.Hs.eg.db, keys=keys(org.Hs.eg.db), columns="GO") head(goannot) genesbygo <- split(goannot$ENTREZID, goannot$GO) length(genesbygo) head(genesbygo) ## ----message=FALSE------------------------------------------------------------ library(GSEABase) library(GSVAdata) data(c2BroadSets) class(c2BroadSets) c2BroadSets ## ----eval=FALSE--------------------------------------------------------------- # library(GSEABase) # library(GSVA) # # URL <- "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c7.immunesigdb.v2024.1.Hs.symbols.gmt" # c7.genesets <- readGMT(URL) ## ----echo=FALSE, message=FALSE, warning=FALSE--------------------------------- library(GSEABase) library(GSVA) gmtfname <- system.file("extdata", "c7.immunesigdb.v2024.1.Hs.symbols.gmt.gz", package="GSVAdata") c7.genesets <- readGMT(gmtfname) c7.genesets ## ----eval=FALSE--------------------------------------------------------------- # gsvaAnnotation(c7.genesets) <- SymbolIdentifier("org.Hs.eg.db") ## ----message=FALSE, error=TRUE------------------------------------------------ try({ fname <- system.file("extdata", "c2.subsetdups.v7.5.symbols.gmt.gz", package="GSVAdata") c2.dupgenesets <- getGmt(fname, geneIdType=SymbolIdentifier()) c2.dupgenesets }) ## ----------------------------------------------------------------------------- c2.dupgenesets <- readGMT(fname, geneIdType=SymbolIdentifier()) c2.dupgenesets any(duplicated(names(c2.dupgenesets))) ## ----message=FALSE------------------------------------------------------------ library(Biobase) data(commonPickrellHuang) stopifnot(identical(featureNames(huangArrayRMAnoBatchCommon_eset), featureNames(pickrellCountsArgonneCQNcommon_eset))) stopifnot(identical(sampleNames(huangArrayRMAnoBatchCommon_eset), sampleNames(pickrellCountsArgonneCQNcommon_eset))) ## ----echo=FALSE--------------------------------------------------------------- ## until the updated GSVAdata goes through the build system ## remove duplicated rows fnames <- featureNames(huangArrayRMAnoBatchCommon_eset) mask <- duplicated(fnames) huangArrayRMAnoBatchCommon_eset <- huangArrayRMAnoBatchCommon_eset[!mask, ] pickrellCountsArgonneCQNcommon_eset <- pickrellCountsArgonneCQNcommon_eset[!mask, ] ## ----------------------------------------------------------------------------- canonicalC2BroadSets <- c2BroadSets[c(grep("^KEGG", names(c2BroadSets)), grep("^REACTOME", names(c2BroadSets)), grep("^BIOCARTA", names(c2BroadSets)))] canonicalC2BroadSets ## ----------------------------------------------------------------------------- data(genderGenesEntrez) MSY <- GeneSet(msYgenesEntrez, geneIdType=EntrezIdentifier(), collectionType=BroadCollection(category="c2"), setName="MSY") MSY XiE <- GeneSet(XiEgenesEntrez, geneIdType=EntrezIdentifier(), collectionType=BroadCollection(category="c2"), setName="XiE") XiE canonicalC2BroadSets <- GeneSetCollection(c(canonicalC2BroadSets, MSY, XiE)) canonicalC2BroadSets ## ----results="hide"----------------------------------------------------------- huangPar <- gsvaParam(huangArrayRMAnoBatchCommon_eset, canonicalC2BroadSets, minSize=5, maxSize=500) esmicro <- gsva(huangPar) pickrellPar <- gsvaParam(pickrellCountsArgonneCQNcommon_eset, canonicalC2BroadSets, minSize=5, maxSize=500, kcdf="Poisson") esrnaseq <- gsva(pickrellPar) ## ----------------------------------------------------------------------------- library(edgeR) lcpms <- cpm(exprs(pickrellCountsArgonneCQNcommon_eset), log=TRUE) ## ----------------------------------------------------------------------------- genecorrs <- sapply(1:nrow(lcpms), function(i, expmicro, exprnaseq) cor(expmicro[i, ], exprnaseq[i, ], method="spearman"), exprs(huangArrayRMAnoBatchCommon_eset), lcpms) names(genecorrs) <- rownames(lcpms) ## ----------------------------------------------------------------------------- pwycorrs <- sapply(1:nrow(esmicro), function(i, esmicro, esrnaseq) cor(esmicro[i, ], esrnaseq[i, ], method="spearman"), exprs(esmicro), exprs(esrnaseq)) names(pwycorrs) <- rownames(esmicro) ## ----compcorrs, echo=FALSE, height=500, width=1000, fig.cap="Comparison of correlation values of gene and pathway expression profiles derived from microarray and RNA-seq data."---- par(mfrow=c(1, 2), mar=c(4, 5, 3, 2)) hist(genecorrs, xlab="Spearman correlation", main="Gene level\n(RNA-seq log-CPMs vs microarray RMA)", xlim=c(-1, 1), col="grey", las=1) hist(pwycorrs, xlab="Spearman correlation", main="Pathway level\n(GSVA enrichment scores)", xlim=c(-1, 1), col="grey", las=1) ## ----compsexgenesets, echo=FALSE, height=500, width=1000, fig.cap="Comparison of GSVA enrichment scores obtained from microarray and RNA-seq data for two gene sets formed by genes with sex-specific expression."---- par(mfrow=c(1, 2)) rmsy <- cor(exprs(esrnaseq)["MSY", ], exprs(esmicro)["MSY", ]) plot(exprs(esrnaseq)["MSY", ], exprs(esmicro)["MSY", ], xlab="GSVA scores RNA-seq", ylab="GSVA scores microarray", main=sprintf("MSY R=%.2f", rmsy), las=1, type="n") fit <- lm(exprs(esmicro)["MSY", ] ~ exprs(esrnaseq)["MSY", ]) abline(fit, lwd=2, lty=2, col="grey") maskPickrellFemale <- pickrellCountsArgonneCQNcommon_eset$Gender == "Female" maskHuangFemale <- huangArrayRMAnoBatchCommon_eset$Gender == "Female" points(exprs(esrnaseq["MSY", maskPickrellFemale]), exprs(esmicro)["MSY", maskHuangFemale], col="red", pch=21, bg="red", cex=1) maskPickrellMale <- pickrellCountsArgonneCQNcommon_eset$Gender == "Male" maskHuangMale <- huangArrayRMAnoBatchCommon_eset$Gender == "Male" points(exprs(esrnaseq)["MSY", maskPickrellMale], exprs(esmicro)["MSY", maskHuangMale], col="blue", pch=21, bg="blue", cex=1) legend("topleft", c("female", "male"), pch=21, col=c("red", "blue"), pt.bg=c("red", "blue"), inset=0.01) rxie <- cor(exprs(esrnaseq)["XiE", ], exprs(esmicro)["XiE", ]) plot(exprs(esrnaseq)["XiE", ], exprs(esmicro)["XiE", ], xlab="GSVA scores RNA-seq", ylab="GSVA scores microarray", main=sprintf("XiE R=%.2f", rxie), las=1, type="n") fit <- lm(exprs(esmicro)["XiE", ] ~ exprs(esrnaseq)["XiE", ]) abline(fit, lwd=2, lty=2, col="grey") points(exprs(esrnaseq["XiE", maskPickrellFemale]), exprs(esmicro)["XiE", maskHuangFemale], col="red", pch=21, bg="red", cex=1) points(exprs(esrnaseq)["XiE", maskPickrellMale], exprs(esmicro)["XiE", maskHuangMale], col="blue", pch=21, bg="blue", cex=1) legend("topleft", c("female", "male"), pch=21, col=c("red", "blue"), pt.bg=c("red", "blue"), inset=0.01) ## ----------------------------------------------------------------------------- data(gbm_VerhaakEtAl) gbm_eset head(featureNames(gbm_eset)) table(gbm_eset$subtype) data(brainTxDbSets) lengths(brainTxDbSets) lapply(brainTxDbSets, head) ## ----results="hide"----------------------------------------------------------- gbmPar <- gsvaParam(gbm_eset, brainTxDbSets, maxDiff=FALSE) gbm_es <- gsva(gbmPar) ## ----gbmSignature, height=500, width=700, fig.cap="Heatmap of GSVA scores for cell-type brain signatures from murine models (y-axis) across GBM samples grouped by GBM subtype."---- library(RColorBrewer) subtypeOrder <- c("Proneural", "Neural", "Classical", "Mesenchymal") sampleOrderBySubtype <- sort(match(gbm_es$subtype, subtypeOrder), index.return=TRUE)$ix subtypeXtable <- table(gbm_es$subtype) subtypeColorLegend <- c(Proneural="red", Neural="green", Classical="blue", Mesenchymal="orange") geneSetOrder <- c("astroglia_up", "astrocytic_up", "neuronal_up", "oligodendrocytic_up") geneSetLabels <- gsub("_", " ", geneSetOrder) hmcol <- colorRampPalette(brewer.pal(10, "RdBu"))(256) hmcol <- hmcol[length(hmcol):1] heatmap(exprs(gbm_es)[geneSetOrder, sampleOrderBySubtype], Rowv=NA, Colv=NA, scale="row", margins=c(3,5), col=hmcol, ColSideColors=rep(subtypeColorLegend[subtypeOrder], times=subtypeXtable[subtypeOrder]), labCol="", gbm_es$subtype[sampleOrderBySubtype], labRow=paste(toupper(substring(geneSetLabels, 1,1)), substring(geneSetLabels, 2), sep=""), cexRow=2, main=" \n ") par(xpd=TRUE) text(0.23,1.21, "Proneural", col="red", cex=1.2) text(0.36,1.21, "Neural", col="green", cex=1.2) text(0.47,1.21, "Classical", col="blue", cex=1.2) text(0.62,1.21, "Mesenchymal", col="orange", cex=1.2) mtext("Gene sets", side=4, line=0, cex=1.5) mtext("Samples ", side=1, line=4, cex=1.5) ## ----------------------------------------------------------------------------- data(geneprotExpCostaEtAl2021) se <- geneExpCostaEtAl2021 se ## ----------------------------------------------------------------------------- gsvaAnnotation(se) <- EntrezIdentifier("org.Hs.eg.db") ## ----------------------------------------------------------------------------- colData(se) table(colData(se)) ## ----genelevelmds, message=FALSE, warning=FALSE, echo=TRUE, small.mar=TRUE, fig.height=4, fig.width=4, dpi=100, fig.cap="Gene-level exploration. Multidimensional scaling (MDS) plot at gene level. Red corresponds to `FIR=yes` and blue to `FIR=no`, while circles and squares correspond, respectively, to female and male neonates."---- library(limma) fircolor <- c(no="skyblue", yes="darkred") sexpch <- c(female=19, male=15) plotMDS(assay(se), col=fircolor[se$FIR], pch=sexpch[se$Sex]) ## ----------------------------------------------------------------------------- innatepat <- c("NKCELL_VS_.+_UP", "MAST_CELL_VS_.+_UP", "EOSINOPHIL_VS_.+_UP", "BASOPHIL_VS_.+_UP", "MACROPHAGE_VS_.+_UP", "NEUTROPHIL_VS_.+_UP") innatepat <- paste(innatepat, collapse="|") innategsets <- names(c7.genesets)[grep(innatepat, names(c7.genesets))] length(innategsets) adaptivepat <- c("CD4_TCELL_VS_.+_UP", "CD8_TCELL_VS_.+_UP", "BCELL_VS_.+_UP") adaptivepat <- paste(adaptivepat, collapse="|") adaptivegsets <- names(c7.genesets)[grep(adaptivepat, names(c7.genesets))] excludepat <- c("NAIVE", "LUPUS", "MYELOID") excludepat <- paste(excludepat, collapse="|") adaptivegsets <- adaptivegsets[-grep(excludepat, adaptivegsets)] length(adaptivegsets) c7.genesets.filt <- c7.genesets[c(innategsets, adaptivegsets)] length(c7.genesets.filt) ## ----------------------------------------------------------------------------- gsvapar <- gsvaParam(se, c7.genesets.filt, assay="logCPM", minSize=5, maxSize=300) ## ----------------------------------------------------------------------------- es <- gsva(gsvapar) es ## ----------------------------------------------------------------------------- assayNames(se) assayNames(es) assay(es)[1:3, 1:3] ## ----------------------------------------------------------------------------- head(lapply(geneSets(es), head)) ## ----------------------------------------------------------------------------- head(geneSetSizes(es)) ## ----pathwaylevelmds, message=FALSE, warning=FALSE, echo=TRUE, small.mar=TRUE, fig.height=4, fig.width=4, dpi=100, fig.cap="Pathway-level exploration. Multidimensional scaling (MDS) plot at pathway level. Red corresponds to `FIR=yes` and blue to `FIR=no`, while circles and squares correspond, respectively, to female and male neonates."---- plotMDS(assay(es), col=fircolor[es$FIR], pch=sexpch[es$Sex]) ## ----message=FALSE, warning=FALSE--------------------------------------------- library(sva) library(limma) ## build design matrix of the model to which we fit the data mod <- model.matrix(~ FIR, colData(es)) ## build design matrix of the corresponding null model mod0 <- model.matrix(~ 1, colData(es)) ## estimate surrogate variables (SVs) with SVA sv <- sva(assay(es), mod, mod0) ## add SVs to the design matrix of the model of interest mod <- cbind(mod, sv$sv) ## fit linear models fit <- lmFit(assay(es), mod) ## calculate moderated t-statistics using the robust regime fit.eb <- eBayes(fit, robust=TRUE) ## summarize the extent of differential expression at 5% FDR res <- decideTests(fit.eb) summary(res) ## ----esstdevxgssize, message=FALSE, warning=FALSE, echo=TRUE, small.mar=TRUE, fig.height=4, fig.width=4, dpi=100, fig.cap="Pathway-level differential expression analysis. Residual standard deviation of GSVA scores as a function of gene set size. Larger gene sets tend to have higher precision."---- gssizes <- geneSetSizes(es) plot(sqrt(gssizes), sqrt(fit.eb$sigma), xlab="Sqrt(gene sets sizes)", ylab="Sqrt(standard deviation)", las=1, pch=".", cex=4) lines(lowess(sqrt(gssizes), sqrt(fit.eb$sigma)), col="red", lwd=2) ## ----------------------------------------------------------------------------- fit.eb.trend <- eBayes(fit, robust=TRUE, trend=gssizes) res <- decideTests(fit.eb.trend) summary(res) ## ----------------------------------------------------------------------------- tt <- topTable(fit.eb.trend, coef=2, n=Inf) DEpwys <- rownames(tt)[tt$adj.P.Val <= 0.05] length(DEpwys) head(DEpwys) ## ----heatmapdepwys, message=FALSE, warning=FALSE, echo=TRUE, fig.height=8, fig.width=10, dpi=100, fig.cap="Pathway-level signature of FIR. Heatmap of GSVA enrichment scores from pathways being called DE with 5% FDR between FIR-affected and unaffected neonates."---- ## get DE pathway GSVA enrichment scores, removing the covariates effect DEpwys_es <- removeBatchEffect(assay(es[DEpwys, ]), covariates=mod[, 2:ncol(mod)], design=mod[, 1:2]) ## cluster samples sam_col_map <- fircolor[es$FIR] names(sam_col_map) <- colnames(DEpwys_es) sampleClust <- hclust(as.dist(1-cor(DEpwys_es, method="spearman")), method="complete") ## cluster pathways gsetClust <- hclust(as.dist(1-cor(t(DEpwys_es), method="pearson")), method="complete") ## annotate pathways whether they are involved in the innate or in ## the adaptive immune response labrow <- rownames(DEpwys_es) mask <- rownames(DEpwys_es) %in% innategsets labrow[mask] <- paste("(INNATE)", labrow[mask], sep="_") mask <- rownames(DEpwys_es) %in% adaptivegsets labrow[mask] <- paste("(ADAPTIVE)", labrow[mask], sep="_") labrow <- gsub("_", " ", gsub("GSE[0-9]+_", "", labrow)) ## pathway expression color scale from blue (low) to red (high) library(RColorBrewer) pwyexpcol <- colorRampPalette(brewer.pal(10, "RdBu"))(256) pwyexpcol <- pwyexpcol[length(pwyexpcol):1] ## generate heatmap heatmap(DEpwys_es, ColSideColors=fircolor[es$FIR], xlab="Samples", ylab="Pathways", margins=c(2, 20), labCol="", labRow=labrow, col=pwyexpcol, scale="row", Colv=as.dendrogram(sampleClust), Rowv=as.dendrogram(gsetClust)) ## ----eval=FALSE--------------------------------------------------------------- # res <- igsva() ## ----session_info, cache=FALSE------------------------------------------------ sessionInfo()