## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  crop = knitr::hook_pdfcrop
)
suppressMessages({
  library(data.table)
  library(tigger)
  library(piglet)
  library(htmltools)
})

## ----echo=FALSE---------------------------------------------------------------
htmltools::img(src = knitr::image_uri("piglet_logo.svg"), 
               alt = 'logo', 
               style = 'position:absolute; top:0; right:0; padding:10px;border: none !important;')

## ----plot-amplicon, fig.width=11, fig.height=2, echo=FALSE, fig.cap="**V library amplicon length.** Each row is a different V coverage, S1 for full length, S2 for BIOMED-2 primers, and S3 for adaptive coverage. The colors indicates the V regions according to IMGT numbering, where dark gray represents the IMGT gaps."----

FWR1 <- "caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttct"
CDR1 <- "ggatacaccttc............accggctactat"
FWR2 <- "atgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacgg"
CDR2 <- "atcaaccctaac......agtggtggcaca"
FWR3 <- "aactatgcacagaagtttcag...ggcagggtcaccagtaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgt"
CDR3 <- "gcgagaga"

regions <- c(FWR1 = FWR1, CDR1 = CDR1, FWR2 = FWR2, CDR2 = CDR2, FWR3 = FWR3, CDR3 = CDR3)
full_seq <- paste(regions, collapse = "")

# Find primer location in ungapped sequence
primer <- "GGCCTCAGTGAAGGTCTCCTGCAAG"
seq_ungapped <- gsub("[.]", "", toupper(full_seq))
loc <- unlist(aregexec(primer, seq_ungapped, max.distance = 4))
cutoff_ungapped <- loc[1] + nchar(primer)

# Map positions to regions and handle masking
seq_chars <- unlist(strsplit(toupper(full_seq), ""))
base_vals <- ifelse(seq_chars == ".", "gap", rep(names(regions), nchar(regions)))

# Calculate start index in gapped sequence (index of the cutoff_ungapped-th non-dot char)
cutoff_s2 <- which(seq_chars != ".")[cutoff_ungapped]

# Prepare Data
s1 <- base_vals
s2 <- s1; s2[seq_len(cutoff_s2 - 1)] <- NA
s3 <- s1; s3[seq_len(224)] <- NA

df <- data.frame(S1 = s1, S2 = s2, S3 = s3)
df$Position <- seq_len(nrow(df))
df_long <- tidyr::pivot_longer(df, cols = c("S1", "S2", "S3"), names_to = "Protocol", values_to = "Region")

# Factor Ordering
df_long$Protocol <- factor(df_long$Protocol, levels = c("S3", "S2", "S1"))
df_long$Region <- factor(df_long$Region, levels = c(names(regions), "gap"))

# Plot
fam_col <- c("brown4", "darkblue", "darkorchid4", "darkgreen", "firebrick", "darkorange3")
color_map <- structure(c(fam_col, "#00000099"), names = levels(df_long$Region))

ggplot2::ggplot(df_long, ggplot2::aes(x = Position, y = Protocol, fill = Region)) +
  ggplot2::geom_tile() +
  ggplot2::scale_fill_manual(values = color_map, na.value = "gray90") +
  ggplot2::theme_minimal() +
  ggplot2::labs(x = "Position", y = "Coverage Protocol") +
  ggplot2::theme(axis.text.y = ggplot2::element_text(size = 12, face = "bold"),
                 legend.position = "bottom",
                 panel.grid.major = ggplot2::element_blank())


## ----germline-----------------------------------------------------------------
library(piglet)
data(HVGERM)

## ----functionality------------------------------------------------------------
data(hv_functionality)

## ----lst-germlineset----------------------------------------------------------
germline <- HVGERM
## keep only functional alleles
germline <- germline[hv_functionality$allele[hv_functionality$functional=="F"]]
## keep only alleles that start from the first position of the V sequence
germline <- germline[!grepl("^[.]", germline)]
## keep only alleles that are at minimum 318 nucleotide long
germline <- germline[nchar(germline) >= 318]
## keep only localized alleles (remove NL)
germline <- germline[!grepl("NL", names(germline))]

## ----lst-germlineset-code, ref.label='lst-germlineset', anchor="block", eval=FALSE----
# germline <- HVGERM
# ## keep only functional alleles
# germline <- germline[hv_functionality$allele[hv_functionality$functional=="F"]]
# ## keep only alleles that start from the first position of the V sequence
# germline <- germline[!grepl("^[.]", germline)]
# ## keep only alleles that are at minimum 318 nucleotide long
# germline <- germline[nchar(germline) >= 318]
# ## keep only localized alleles (remove NL)
# germline <- germline[!grepl("NL", names(germline))]

## ----lst-asc, fig.cap=""------------------------------------------------------
asc <- inferAlleleClusters(
  germline_set = germline, 
  trim_3prime_side = 318, 
  mask_5prime_side = 0, 
  family_threshold = 75, 
  allele_cluster_threshold = 95)

## ----asc-plot, fig.height=18, fig.width=15, fig.cap="**Allele similarity clusters.** The out most circle is the allele names, the second layer are the ASC groups, each group is labeled and colored. The third circle is the clustering dendrogram, the branches are colored by the ASC families. The blue and orange dashed lines are the 95% and 75% similarity ASC threshold."----
plot(asc)

## ----frw1---------------------------------------------------------------------
germline_frw1 <- artificialFRW1Germline(germline, mask_primer = T)

## ----eval=FALSE---------------------------------------------------------------
# zenodo_doi <- "10.5281/zenodo.7401189"
# asc_archive <-
#   recentAlleleClusters(doi = zenodo_doi, get_file = TRUE)

## ----eval=FALSE---------------------------------------------------------------
# allele_cluster_table <- extractASCTable(archive_file = asc_archive)

## ----echo=FALSE---------------------------------------------------------------
allele_cluster_table <- read.delim('asc_alleles_table.tsv',sep='\t')

## ----echo=FALSE---------------------------------------------------------------
asc_tables <- dplyr::left_join(
  asc$alleleClusterTable %>%
    dplyr::select(new_allele, imgt_allele) %>% dplyr::group_by(new_allele) %>%
    dplyr::summarise(imgt_allele = paste0(sort(
      unique(imgt_allele)
    ), collapse = "/")),
  allele_cluster_table %>% dplyr::select(new_allele, imgt_allele) %>% dplyr::group_by(new_allele) %>%
    dplyr::summarise(imgt_allele = paste0(sort(
      unique(imgt_allele)
    ), collapse = "/")),
  by = "new_allele",
  suffix = c(".piglet", ".zenodo")
)

## -----------------------------------------------------------------------------
# loading TIgGER AIRR-seq b cell data
data <- tigger::AIRRDb

## -----------------------------------------------------------------------------
allele_cluster_table <-
  allele_cluster_table %>% dplyr::group_by(new_allele, func_group, thresh) %>%
  dplyr::summarise(imgt_allele = paste0(sort(unique(imgt_allele)), collapse = "/"),
                   .groups = "keep")

## -----------------------------------------------------------------------------
# storing original v_call values
data$v_call_or <- data$v_call
# assigning the ASC alleles
asc_data <- assignAlleleClusters(data, allele_cluster_table)

head(asc_data[, c("v_call", "v_call_or")])

## -----------------------------------------------------------------------------
# reforming the germline set
asc_germline <- germlineASC(allele_cluster_table, germline = HVGERM)

## -----------------------------------------------------------------------------
# inferring the genotype
asc_genotype <- inferGenotypeAllele_asc(
  asc_data,
  alleleClusterTable = allele_cluster_table,
  germline_db = asc_germline,
  find_unmutated = T
)

head(asc_genotype)

## -----------------------------------------------------------------------------
# get the genotype alleles
alleles <- unlist(strsplit(asc_genotype$genotyped_imgt_alleles, ","))
# get the genes
genes <- gsub("[*][0-9]+", "", alleles)
# extract the alleles
alleles <- sapply(strsplit(alleles, "[*]"), "[[", 2)
# make sure to extract only alleles
alleles <- gsub("([0-9]+).*$", "\\1", alleles)
# create the genotype
genotype <- data.frame(alleles = alleles, gene = genes)
# plot the genotype
tigger::plotGenotype(genotype = genotype)