library(scater)
library(ggplot2)
library(BiocParallel)
library(blase)
library(limma)
library(ggVennDiagram)
RNGversion("3.5.0")
#> Warning in RNGkind("Mersenne-Twister", "Inversion", "Rounding"): non-uniform
#> 'Rounding' sampler used
SEED <- 7
set.seed(SEED)
N_CORES <- 2
bpparam <- MulticoreParam(N_CORES)
This article will show how BLASE can be used for excluding the effect of developmental differences when analysing bulk RNA-seq data. We make use of scRNA-seq Dogga et al. 2024 and bulk RNA-seq data from Zhang et al. 2021. Code for generating the objects used here is available in the BLASE reproducibility repository.
First we’ll load in the data we’re using, pre-prepared from the BLASE reproducibility repository.
data(zhang_2021_heat_shock_bulk, package = "blase")
data(MCA_PF_SCE, package = "blase")
We can examine the true lifecycle stages, and also the calculated pseudotime trajectory (Slingshot).
plotUMAP(MCA_PF_SCE, colour_by = "STAGE_HR")
#| fig.alt: >
#| UMAP of Dogga et al. single-cell RNA-seq reference coloured by pseudotime,
#| starting with Rings, and ending with Schizonts.
plotUMAP(MCA_PF_SCE, colour_by = "slingPseudotime_1")
## Prepare BLASE
Now we’ll prepare BLASE for use.
First, we create the object, giving it the number of bins we want to use, and how to calculate them.
blase_data <- as.BlaseData(
MCA_PF_SCE,
pseudotime_slot = "slingPseudotime_1",
n_bins = 8,
split_by = "pseudotime_range"
)
# Add these bins to the sc metadata
MCA_PF_SCE <- assign_pseudotime_bins(MCA_PF_SCE,
pseudotime_slot = "slingPseudotime_1",
n_bins = 8,
split_by = "pseudotime_range"
)
Now we will select the genes we want to use, using BLASE’s gene peakedness metric.
gene_peakedness_info <- calculate_gene_peakedness(
MCA_PF_SCE,
window_pct = 5,
knots = 18,
BPPARAM = bpparam
)
genes_to_use <- gene_peakedness_spread_selection(
MCA_PF_SCE,
gene_peakedness_info,
genes_per_bin = 30,
n_gene_bins = 30
)
head(gene_peakedness_info)
#> gene peak_pseudotime mean_in_window mean_out_window ratio
#> 28 PF3D7-1401100 3.8311312 5.793085 1.528934 3.788969
#> 44 PF3D7-1401200 6.0203490 4.791942 1.921439 2.493934
#> 29 PF3D7-1401400 3.9679573 696.174509 39.689901 17.540344
#> 84 PF3D7-1401600 11.4933936 93.076303 7.631160 12.196875
#> 1 PF3D7-1401900 0.1368261 43.270750 3.321464 13.027613
#> 80 PF3D7-1402200 10.9460892 3.754015 1.554102 2.415553
#> window_start window_end deviance_explained
#> 28 3.4890659 4.1731965 0.1881312
#> 44 5.6782838 6.3624143 0.2035355
#> 29 3.6258920 4.3100226 0.2513795
#> 84 11.1513283 11.8354589 0.6891177
#> 1 -0.2052392 0.4788914 0.3846938
#> 80 10.6040239 11.2881544 0.1768597
Here, we add them to the BLASE object for mapping with.
genes(blase_data) <- genes_to_use
Now we can perform the actual mapping step, and review the results.
mapping_results <- map_all_best_bins(
blase_data,
zhang_2021_heat_shock_bulk,
BPPARAM = bpparam
)
plot_mapping_result_heatmap(mapping_results)
We calculate DE genes using Limma (ref) as we have access to only normalised counts.
metadata <- data.frame(row.names = seq_len(12))
metadata$strain <- c(
rep("NF54", 4),
rep("PB4", 4),
rep("PB31", 4)
)
metadata$growth_conditions <- rep(c("Normal", "Normal", "HS", "HS"), 3)
metadata$group <- paste0(metadata$strain, "_", metadata$growth_conditions)
rownames(metadata) <- c(
"NF54_37_1",
"NF54_37_2",
"NF54_41_1",
"NF54_41_2",
"PB4_37_1",
"PB4_37_2",
"PB4_41_1",
"PB4_41_2",
"PB31_37_1",
"PB31_37_2",
"PB31_41_1",
"PB31_41_2"
)
design <- model.matrix(~ 0 + group, metadata)
colnames(design) <- gsub("group", "", colnames(design))
contr.matrix <- makeContrasts(
WT_normal_v_HS = NF54_Normal - NF54_HS,
PB31_normal_v_HS = PB31_Normal - PB31_HS,
normal_NF54_v_PB31 = NF54_Normal - PB31_Normal,
levels = colnames(design)
)
vfit <- lmFit(zhang_2021_heat_shock_bulk, design)
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
efit <- eBayes(vfit)
summary(decideTests(efit))
#> WT_normal_v_HS PB31_normal_v_HS normal_NF54_v_PB31
#> Down 118 118 179
#> NotSig 822 616 767
#> Up 50 256 44
PB31_normal_v_HS_BULK_DE <- topTable(efit, n = Inf, coef = 2)
PB31_normal_v_HS_BULK_DE <- PB31_normal_v_HS_BULK_DE[PB31_normal_v_HS_BULK_DE$adj.P.Val < 0.05 & PB31_normal_v_HS_BULK_DE$logFC > 0, ]
PB31_normal_v_HS_BULK_DE <- PB31_normal_v_HS_BULK_DE[order(-PB31_normal_v_HS_BULK_DE$logFC), ]
First, let’s pseudobulk based on the pseudotime bins
pseudobulked_MCA_PF_SCE <- data.frame(row.names = rownames(MCA_PF_SCE))
for (r in mapping_results) {
pseudobulked_MCA_PF_SCE[bulk_name(r)] <- rowSums(counts(MCA_PF_SCE[, MCA_PF_SCE$pseudotime_bin == best_bin(r)]))
}
pseudobulked_MCA_PF_SCE <- as.matrix(pseudobulked_MCA_PF_SCE)
And now calculate DE genes for these
## Normalise, not needed for true bulks
par(mfrow = c(1, 2))
v <- voom(pseudobulked_MCA_PF_SCE, design, plot = FALSE)
vfit_sc <- lmFit(v, design)
vfit_sc <- contrasts.fit(vfit_sc, contrasts = contr.matrix)
efit_sc <- eBayes(vfit_sc)
summary(decideTests(efit_sc))
#> WT_normal_v_HS PB31_normal_v_HS normal_NF54_v_PB31
#> Down 0 257 127
#> NotSig 1746 1370 1343
#> Up 0 119 276
PB31_normal_v_HS_SC_DE <- topTable(efit_sc, n = Inf, coef = 2)
PB31_normal_v_HS_SC_DE <- PB31_normal_v_HS_SC_DE[PB31_normal_v_HS_SC_DE$adj.P.Val < 0.05 & PB31_normal_v_HS_SC_DE$logFC > 0, ]
PB31_normal_v_HS_SC_DE <- PB31_normal_v_HS_SC_DE[order(-PB31_normal_v_HS_SC_DE$logFC), ]
How many of these genes overlap? We can look at the intersection using a Venn Diagram:
ggVennDiagram(list(
Phenotype = rownames(PB31_normal_v_HS_BULK_DE),
Development = rownames(PB31_normal_v_HS_SC_DE)
)) + ggtitle("PB31 Normal vs Heat Shock")
Great, there are genes which we can correct. We can now remove these from the original DE genes list.
NB: This is a primitive approach, and there are likely many other and better ways to calculate which genes should be kept or removed.
PB31_normal_v_HS_corrected_DE <- rownames(PB31_normal_v_HS_BULK_DE[!(rownames(PB31_normal_v_HS_BULK_DE) %in% rownames(PB31_normal_v_HS_SC_DE)), ])
head(PB31_normal_v_HS_corrected_DE)
#> [1] "PF3D7-0725400" "PF3D7-0905400" "PF3D7-1010300" "PF3D7-0731500"
#> [5] "PF3D7-0913800" "PF3D7-1017500"
Further downstream analysis can now be performed, as desired by the researcher, for example Gene Ontology term enrichment.
sessionInfo()
#> R version 4.5.1 Patched (2025-08-23 r88802)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.22-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> Random number generation:
#> RNG: Mersenne-Twister
#> Normal: Inversion
#> Sample: Rounding
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] ggVennDiagram_1.5.4 limma_3.66.0
#> [3] patchwork_1.3.2 blase_1.0.0
#> [5] BiocParallel_1.44.0 scater_1.38.0
#> [7] ggplot2_4.0.0 scuttle_1.20.0
#> [9] SingleCellExperiment_1.32.0 SummarizedExperiment_1.40.0
#> [11] Biobase_2.70.0 GenomicRanges_1.62.0
#> [13] Seqinfo_1.0.0 IRanges_2.44.0
#> [15] S4Vectors_0.48.0 BiocGenerics_0.56.0
#> [17] generics_0.1.4 MatrixGenerics_1.22.0
#> [19] matrixStats_1.5.0 BiocStyle_2.38.0
#>
#> loaded via a namespace (and not attached):
#> [1] RcppAnnoy_0.0.22 splines_4.5.1 later_1.4.4
#> [4] tibble_3.3.0 polyclip_1.10-7 fastDummies_1.7.5
#> [7] lifecycle_1.0.4 globals_0.18.0 lattice_0.22-7
#> [10] MASS_7.3-65 magrittr_2.0.4 plotly_4.11.0
#> [13] sass_0.4.10 rmarkdown_2.30 jquerylib_0.1.4
#> [16] yaml_2.3.10 httpuv_1.6.16 otel_0.2.0
#> [19] Seurat_5.3.1 sctransform_0.4.2 spam_2.11-1
#> [22] sp_2.2-0 spatstat.sparse_3.1-0 reticulate_1.44.0
#> [25] cowplot_1.2.0 pbapply_1.7-4 RColorBrewer_1.1-3
#> [28] abind_1.4-8 Rtsne_0.17 purrr_1.1.0
#> [31] ggrepel_0.9.6 irlba_2.3.5.1 listenv_0.9.1
#> [34] spatstat.utils_3.2-0 goftest_1.2-3 RSpectra_0.16-2
#> [37] spatstat.random_3.4-2 fitdistrplus_1.2-4 parallelly_1.45.1
#> [40] codetools_0.2-20 DelayedArray_0.36.0 tidyselect_1.2.1
#> [43] farver_2.1.2 ScaledMatrix_1.18.0 viridis_0.6.5
#> [46] spatstat.explore_3.5-3 jsonlite_2.0.0 BiocNeighbors_2.4.0
#> [49] progressr_0.17.0 ggridges_0.5.7 survival_3.8-3
#> [52] tools_4.5.1 ica_1.0-3 Rcpp_1.1.0
#> [55] glue_1.8.0 gridExtra_2.3 SparseArray_1.10.0
#> [58] xfun_0.53 mgcv_1.9-3 dplyr_1.1.4
#> [61] withr_3.0.2 BiocManager_1.30.26 fastmap_1.2.0
#> [64] boot_1.3-32 digest_0.6.37 rsvd_1.0.5
#> [67] R6_2.6.1 mime_0.13 scattermore_1.2
#> [70] tensor_1.5.1 dichromat_2.0-0.1 spatstat.data_3.1-9
#> [73] tidyr_1.3.1 data.table_1.17.8 httr_1.4.7
#> [76] htmlwidgets_1.6.4 S4Arrays_1.10.0 uwot_0.2.3
#> [79] pkgconfig_2.0.3 gtable_0.3.6 lmtest_0.9-40
#> [82] S7_0.2.0 XVector_0.50.0 htmltools_0.5.8.1
#> [85] dotCall64_1.2 bookdown_0.45 SeuratObject_5.2.0
#> [88] scales_1.4.0 png_0.1-8 spatstat.univar_3.1-4
#> [91] knitr_1.50 reshape2_1.4.4 nlme_3.1-168
#> [94] cachem_1.1.0 zoo_1.8-14 stringr_1.5.2
#> [97] KernSmooth_2.23-26 parallel_4.5.1 miniUI_0.1.2
#> [100] vipor_0.4.7 pillar_1.11.1 grid_4.5.1
#> [103] vctrs_0.6.5 RANN_2.6.2 promises_1.4.0
#> [106] BiocSingular_1.26.0 beachmat_2.26.0 xtable_1.8-4
#> [109] cluster_2.1.8.1 beeswarm_0.4.0 evaluate_1.0.5
#> [112] tinytex_0.57 magick_2.9.0 cli_3.6.5
#> [115] compiler_4.5.1 rlang_1.1.6 future.apply_1.20.0
#> [118] labeling_0.4.3 plyr_1.8.9 ggbeeswarm_0.7.2
#> [121] stringi_1.8.7 viridisLite_0.4.2 deldir_2.0-4
#> [124] lazyeval_0.2.2 spatstat.geom_3.6-0 Matrix_1.7-4
#> [127] RcppHNSW_0.6.0 sparseMatrixStats_1.22.0 future_1.67.0
#> [130] statmod_1.5.1 shiny_1.11.1 ROCR_1.0-11
#> [133] igraph_2.2.1 bslib_0.9.0