## ----eval=TRUE, warning=FALSE, message=FALSE---------------------------------- # Import required packages library(alakazam) library(dplyr) library(ggplot2) library(shazam) # Load and subset example data (for speed) data(ExampleDb, package="alakazam") set.seed(112) db <- ExampleDb %>% sample_n(size=500) db %>% count(sample_id) ## ----eval=TRUE, warning=FALSE------------------------------------------------- # Use nucleotide Hamming distance and normalize by junction length dist_ham <- distToNearest(db %>% filter(sample_id == "+7d"), sequenceColumn="junction", vCallColumn="v_call_genotyped", jCallColumn="j_call", model="ham", normalize="len", nproc=1) # Use genotyped V assignments, a 5-mer model and no normalization dist_s5f <- distToNearest(db %>% filter(sample_id == "+7d"), sequenceColumn="junction", vCallColumn="v_call_genotyped", jCallColumn="j_call", model="hh_s5f", normalize="none", nproc=1) ## ----eval=FALSE, warning=FALSE------------------------------------------------ # # Single-cell mode # # Group cells in a one-stage process (VJthenLen=FALSE) and using # # both heavy and light chain sequences (onlyHeavy=FALSE) # # data(Example10x, package="alakazam") # dist_sc <- distToNearest(Example10x, cellIdColumn="cell_id", locusColumn="locus", # VJthenLen=FALSE, onlyHeavy=FALSE) ## ----eval=TRUE, warning=FALSE, fig.width=7------------------------------------ # Generate Hamming distance histogram p1 <- ggplot(subset(dist_ham, !is.na(dist_nearest)), aes(x=dist_nearest)) + geom_histogram(color="white", binwidth=0.02) + geom_vline(xintercept=0.12, color="firebrick", linetype=2) + labs(x = "Hamming distance", y = "Count") + scale_x_continuous(breaks=seq(0, 1, 0.1)) + theme_bw() plot(p1) ## ----eval=TRUE, warning=FALSE, fig.width=7------------------------------------ # Generate HH_S5F distance histogram p2 <- ggplot(subset(dist_s5f, !is.na(dist_nearest)), aes(x=dist_nearest)) + geom_histogram(color="white", binwidth=1) + geom_vline(xintercept=7, color="firebrick", linetype=2) + labs(x = "HH_S5F distance", y = "Count") + scale_x_continuous(breaks=seq(0, 50, 5)) + theme_bw() plot(p2) ## ----eval=TRUE, warning=FALSE, fig.width=7------------------------------------ # Find threshold using density method output <- findThreshold(dist_ham$dist_nearest, method="density") threshold <- output@threshold # Plot distance histogram, density estimate and optimum threshold plot(output, title="Density Method") # Print threshold print(output) ## ----eval=TRUE, warning=FALSE, fig.width=7------------------------------------ # Find threshold using gmm method output <- findThreshold(dist_ham$dist_nearest, method="gmm", model="gamma-gamma") # Plot distance histogram, Gaussian fits, and optimum threshold plot(output, binwidth=0.02, title="GMM Method: gamma-gamma") # Print threshold print(output) ## ----fields, eval=TRUE, warning=FALSE----------------------------------------- dist_fields <- distToNearest(db, model="ham", normalize="len", fields="sample_id", nproc=1) ## ----eval=TRUE, warning=FALSE, fig.width=7------------------------------------ # Generate grouped histograms p4 <- ggplot(subset(dist_fields, !is.na(dist_nearest)), aes(x=dist_nearest)) + geom_histogram(color="white", binwidth=0.02) + geom_vline(xintercept=0.12, color="firebrick", linetype=2) + labs(x = "Grouped Hamming distance", y = "Count") + facet_grid(sample_id ~ ., scales="free_y") + theme_bw() plot(p4) ## ----cross, eval=TRUE, warning=FALSE------------------------------------------ dist_cross <- distToNearest(ExampleDb, sequenceColumn="junction", vCallColumn="v_call_genotyped", jCallColumn="j_call", model="ham", first=FALSE, normalize="len", cross="sample_id", nproc=1) ## ----eval=TRUE, warning=FALSE, fig.width=7------------------------------------ # Generate cross sample histograms p5 <- ggplot(subset(dist_cross, !is.na(cross_dist_nearest)), aes(x=cross_dist_nearest)) + labs(x = "Cross-sample Hamming distance", y = "Count") + geom_histogram(color="white", binwidth=0.02) + geom_vline(xintercept=0.12, color="firebrick", linetype=2) + facet_grid(sample_id ~ ., scales="free_y") + theme_bw() plot(p5) ## ----subsample, eval=TRUE, warning=FALSE-------------------------------------- # Explore V-J-junction length groups sizes to use subsample # Show the size of the largest groups top_10_sizes <- ExampleDb %>% group_by(junction_length) %>% # Group by junction length do(alakazam::groupGenes(., first=TRUE)) %>% # Group by V and J call mutate(GROUP_ID=paste(junction_length, vj_group, sep="_")) %>% # Create group ids ungroup() %>% group_by(GROUP_ID) %>% # Group by GROUP_ID distinct(junction) %>% # Count unique junctions per group summarize(SIZE=n()) %>% # Get the size of the group arrange(desc(SIZE)) %>% # Sort by decreasing size select(SIZE) %>% top_n(10) # Filter to the top 10 top_10_sizes # Use 30 to subsample # NOTE: This is a toy example. Subsampling to 30 sequence with real data is unwise dist <- distToNearest(ExampleDb, sequenceColumn="junction", vCallColumn="v_call_genotyped", jCallColumn="j_call", model="ham", first=FALSE, normalize="len", subsample=30)