## ----setup, include=FALSE----------------------------------------------------- library(BiocStyle) ## ----eval=FALSE--------------------------------------------------------------- # if (!require("BiocManager", quietly=TRUE)) # install.packages("BiocManager") # BiocManager::install("igblastr") ## ----message=FALSE------------------------------------------------------------ library(igblastr) ## ----------------------------------------------------------------------------- if (!has_igblast()) install_igblast() ## ----------------------------------------------------------------------------- igblast_info() ## ----------------------------------------------------------------------------- list_germline_dbs(builtin.only=TRUE) ## ----------------------------------------------------------------------------- head(list_IMGT_releases()) ## ----------------------------------------------------------------------------- list_IMGT_organisms("202506-1") ## ----message=FALSE------------------------------------------------------------ install_IMGT_germline_db("202506-1", organism="Homo sapiens", force=TRUE) ## ----------------------------------------------------------------------------- use_germline_db("IMGT-202506-1.Homo_sapiens.IGH+IGK+IGL") ## ----------------------------------------------------------------------------- list_germline_dbs() ## ----------------------------------------------------------------------------- list_c_region_dbs() ## ----------------------------------------------------------------------------- use_c_region_db("_IMGT.human.IGH+IGK+IGL.202412") ## ----------------------------------------------------------------------------- query <- system.file(package="igblastr", "extdata", "1279067_1_Paired_sequences.fasta.gz") ## ----------------------------------------------------------------------------- json <- system.file(package="igblastr", "extdata", "1279067_1_Paired_All.json") query_metadata <- jsonlite::fromJSON(json) query_metadata ## ----------------------------------------------------------------------------- use_germline_db() use_c_region_db() ## ----------------------------------------------------------------------------- AIRR_df <- igblastn(query, num_alignments_V=1) ## ----------------------------------------------------------------------------- AIRR_df ## ----message=FALSE------------------------------------------------------------ library(ggplot2) library(dplyr) library(scales) library(ggseqlogo) ## ----------------------------------------------------------------------------- AIRR_df |> ggplot(aes(locus, 100 - v_identity)) + theme_bw(base_size=14) + geom_point(position = position_jitter(width = 0.3), alpha = 0.1) + geom_boxplot(color = "blue", fill = NA, outliers = FALSE, alpha = 0.3) + ggtitle("Distribution of V percent mutation by locus") + xlab(NULL) ## ----------------------------------------------------------------------------- plot_gene_dist <- function(AIRR_df, loc) { df_v_gene <- AIRR_df |> filter(locus == loc) |> mutate(v_gene = sub("\\*[0-9]*", "", v_call)) |> # drop allele info group_by(v_gene) |> summarize(n = n(), .groups = "drop") |> mutate(frac = n / sum(n)) df_v_gene |> ggplot(aes(frac, v_gene)) + theme_bw(base_size=13) + geom_col() + scale_x_continuous('Percent of sequences', labels = scales::percent) + ylab("Germline gene") + ggtitle(paste0(loc, "V gene prevalence")) } ## ----fig.height=8------------------------------------------------------------- plot_gene_dist(AIRR_df, "IGH") ## ----fig.height=5.9----------------------------------------------------------- plot_gene_dist(AIRR_df, "IGK") ## ----fig.height=5.3----------------------------------------------------------- plot_gene_dist(AIRR_df, "IGL") ## ----fig.height=4.5----------------------------------------------------------- AIRR_df$cdr3_aa_length <- nchar(AIRR_df$cdr3_aa) AIRR_df |> group_by(locus, cdr3_aa_length) |> summarize(n = n(), .groups = "drop") |> ggplot(aes(cdr3_aa_length, n)) + theme_bw(base_size=14) + facet_wrap(~locus) + geom_col() + ggtitle("Histograms of CDR3 length by locus") ## ----------------------------------------------------------------------------- AIRR_df |> filter(locus == "IGK", cdr3_aa_length == 9) |> pull(cdr3_aa) |> ggseqlogo(method = "probability") + theme_bw(base_size=14) + ggtitle("Logo plot of kappa chain CDR3 sequences that are 9 AA long") ## ----------------------------------------------------------------------------- sessionInfo()