Martin Morgan
February 3, 2015
Topics covered:

- system.time(), Rprof(), microbenchmark
- open(), read chunk(s), close()
- yieldSize argument to Rsamtools::BamFile()
- GenomicFiles::reduceByYield()
- Rsamtools::ScanBamParam()
- ShortRead::FastqSampler()
- lapply()-like operations
- bplapply() for lapply()-like functions, increasingly used by package developers to provide an easy, standard way of gaining parallel evaluation

Exercise: write the following as a function. Use system.time() to explore how
long it takes to execute as n increases from 100 to 10000. Use
identical() and microbenchmark::microbenchmark() to compare the alternatives
f1(), f2(), and f3() for both correctness and performance. What strategies
are these functions using?
f0 <- function(n) {
    ## inefficient!
    ans <- numeric()
    for (i in seq_len(n))
        ans <- c(ans, exp(i))
    ans
}

f1 <- function(n) {
    ans <- numeric(n)
    for (i in seq_len(n))
        ans[[i]] <- exp(i)
    ans
}

f2 <- function(n)
    sapply(seq_len(n), exp)

f3 <- function(n)
    exp(seq_len(n))
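One way to carry out the comparison; a minimal sketch, assuming the microbenchmark package is installed (n = 1000 is an arbitrary choice):
library(microbenchmark)
n <- 1000
## correctness: all four implementations should return the same vector
identical(f0(n), f1(n))
identical(f1(n), f2(n))
identical(f2(n), f3(n))
## performance: time each implementation a few times
microbenchmark(f0(n), f1(n), f2(n), f3(n), times = 5)
## scaling of the inefficient version
system.time(f0(10000))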
fun() goes to sleep for 1 second, then returns i. Applied serially to 8 elements this takes about 8 seconds; the parallel version finishes in a fraction of that time on a multi-core machine.
library(BiocParallel)

fun <- function(i) {
    Sys.sleep(1)
    i
}

## serial
f0 <- function(n)
    lapply(seq_len(n), fun)

## parallel
f1 <- function(n)
    bplapply(seq_len(n), fun)
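A quick timing comparison; a sketch, assuming the default registered back-end provides several workers:
n <- 8
system.time(f0(n))    # about n seconds, serial
system.time(f1(n))    # much less: the sleeps run concurrently across workers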
Iterate through files: GenomicFiles::reduceByYield()
suppressPackageStartupMessages({
    library(GenomicFiles)
    library(GenomicAlignments)
    library(Rsamtools)
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
})

yield <-     # how to input the next chunk of data
    function(X, ...)
{
    readGAlignments(X)
}

map <-       # what to do to each chunk
    function(VALUE, ..., roi)
{
    olaps <- findOverlaps(VALUE, roi, type="within", ignore.strand=TRUE)
    count <- tabulate(subjectHits(olaps), subjectLength(olaps))
    setNames(count, names(roi))
}

reduce <-    # how to combine mapped chunks
    `+`
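These three pieces plug straight into reduceByYield(); a minimal sketch, where "example.bam" is a placeholder file name and roi stands for the regions of interest constructed below (the complete version, using a yield factory, is count1() further down):
bf <- BamFile("example.bam", yieldSize=1000000)  # placeholder path
reduceByYield(bf, yield, map, reduce, roi=roi)   # 'roi' defined in the next step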
Improvement: a “yield factory” to keep track of how many records have been input
yieldFactory <-  # return a function with local state
    function()
{
    n_records <- 0L
    function(X, ...) {
        aln <- readGAlignments(X)
        n_records <<- n_records + length(aln)
        message(n_records)
        aln
    }
}
Regions of interest, with seqlevels renamed to match the chromosome names used in the BAM files.
exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx")

## two-column alias table; rename the seqlevels so the annotation uses the
## same chromosome names as the BAM files (names = current levels,
## values = new levels)
map0 <- read.delim("~/igv/genomes/hg19_alias.tab", header=FALSE,
    stringsAsFactors=FALSE)
seqlevels(exByTx, force=TRUE) <- setNames(map0$V1, map0$V2)
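A quick sanity check that the renamed annotation and a BAM file agree on chromosome names; the file path matches the one used below:
seqlevels(exByTx)
seqnames(seqinfo(BamFile("~/bam/SRR1039508_sorted.bam")))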
A function to iterate through a BAM file
count1 <- function(filename, roi) {
    message(filename)
    ## Create and open BAM file
    bf <- BamFile(filename, yieldSize=1000000)
    reduceByYield(bf, yieldFactory(), map, reduce, roi=roi)
}
In action
filename <- "~/bam/SRR1039508_sorted.bam"
count <- count1(filename, exByTx)
Parallelize
library(BiocParallel)

## all BAM files
filenames <- dir("~/bam", pattern="bam$", full.names=TRUE)
names(filenames) <- sub("_sorted.bam", "", basename(filenames))

## count each file in parallel
counts <- bplapply(filenames, count1, roi=exByTx)
counts <- simplify2array(counts)
head(counts)
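The number of workers used by bplapply() can be controlled by registering a back-end before the call; a sketch, assuming a Unix-alike machine (SnowParam() is the analogous choice on Windows):
library(BiocParallel)
register(MulticoreParam(workers = 4))  # subsequent bplapply() calls use 4 workers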