## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( fig.path = "figures/", dev = "png", dpi = 300, fig.width = 7, fig.height = 4, fig.align = "center", out.width = "70%", message = FALSE, warning = FALSE ) ## ----installation, eval=FALSE------------------------------------------------- # # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install("DspikeIn") # # library(DspikeIn) ## ----eval=TRUE---------------------------------------------------------------- library(SummarizedExperiment) library(DspikeIn) # DspikeIn requirements # -------------------- # DspikeIn operates on a taxonomy table with exactly seven ranks: # Kingdom Phylum Class Order Family Genus Species expected <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") actual <- colnames(as.data.frame(SummarizedExperiment::rowData(tse))) if (!identical(expected, actual)) { stop( paste0( "DspikeIn requires exactly 7 taxonomic ranks in this order:\n ", paste(expected, collapse = " "), "\n\nYour taxonomy columns are:\n ", paste(actual, collapse = ", "), "\n\nPlease remove or rename extra ranks (e.g., 'Strain', 'Subfamily') ", "to match the required format." ) ) } ## ----eval=TRUE---------------------------------------------------------------- library(phyloseq) # Get path to external data folder extdata_path <- system.file("extdata", package = "DspikeIn") list.files(extdata_path) # and data("physeq_16SOTU", package = "DspikeIn") length(taxa_names(physeq_16SOTU)) new_ids <- paste0("OTU", seq_along(taxa_names(physeq_16SOTU))) taxa_names(physeq_16SOTU) <- new_ids # Verify head(taxa_names(physeq_16SOTU)) # [1] "OTU1" "OTU2" "OTU3" "OTU4" "OTU5" "OTU6" tse_16SOTU <- convert_phyloseq_to_tse(physeq_16SOTU) tse_16SOTU <- tidy_phyloseq_tse(tse_16SOTU) ## ----------------------------------------------------------------------------- # Use the Neighbor-Joining method based on a Jukes-Cantor distance matrix and plot the tree with bootstrap values. # we compare the Sanger read of Tetragenococcus halophilus with the FASTA sequence of Tetragenococcus halophilus from our phyloseq object. library(Biostrings) library(TreeSummarizedExperiment) library(SummarizedExperiment) library(DspikeIn) # Filter TSE object to keep only Bacteria and Archaea tse_16SOTU <- tse_16SOTU[ rowData(tse_16SOTU)$Kingdom %in% c("Bacteria", "Archaea"),] library(Biostrings) # # # Subset the TSE object to include only Tetragenococcus # tetragenococcus_tse <- tse_16SOTU[ # rowData(tse_16SOTU)$Genus == "Tetragenococcus" & # !is.na(rownames(tse_16SOTU)) & # rownames(tse_16SOTU) != "", ] # # ref_sequences <- referenceSeq(tetragenococcus_tse) # # # Convert to DNAStringSet if needed # ref_sequences <- Biostrings::DNAStringSet(ref_sequences) # Biostrings::writeXStringSet(ref_sequences, filepath = "Sample.fasta") ref_fasta <- system.file("extdata", "Ref.fasta", package = "DspikeIn") sample_fasta <- system.file("extdata", "Sample.fasta", package = "DspikeIn") result <- validate_spikein_clade( reference_fasta = ref_fasta, sample_fasta = sample_fasta, bootstrap = 200, output_prefix = NULL) result$tree_plot ## ----fig.align='center', fig.width=7, fig.height=5, out.width='50%', dpi=120---- library(ggstar) library(ggplot2) # Filter your object to only include spike-in taxa beforehand: # We changed the OTU IDs for easy detection # Big stars = detected in many samples # Small stars = rarely detected # log10(Mean Abundance) Bars= Color intensity reflects mean abundance. # The log-transformed average abundance of each OTU across all samples where it appears. # Extreme blue may signal unintended over-representation. # Metadata Ring = factor of your interest e.g. Animal.type # Each OTU is colored by where it was observed. # Branch length numbers= Actual evolutionary distances (small = very similar) library(DspikeIn) library(TreeSummarizedExperiment) # ---- 1. Subset taxa where Genus is Tetragenococcus ---- spikein_tse <- tse_16SOTU[ rowData(tse_16SOTU)$Genus == "Tetragenococcus", ] # ---- 2. Diagnostic plot (tree-based) ---- ps <- plot_spikein_tree_diagnostic( obj = spikein_tse, metadata_var = "Animal.type", save_plot = FALSE ) print(ps) ## ----------------------------------------------------------------------------- # PREREQUISITE FOR 16S & CALCULATE SPIKED % # Define spiked species and related parameters** library(flextable) library(DspikeIn) library(TreeSummarizedExperiment) library(SummarizedExperiment) library(dplyr) # Define spike-in parameters spiked_cells <- 1847 species_name <- spiked_species <- c("Tetragenococcus_halophilus", "Tetragenococcus_sp.") merged_spiked_species <- "Tetragenococcus" # If you prefer Genus-level matching # species_name <- spiked_species <- c("Tetragenococcus") # merged_spiked_species <- "Tetragenococcus" # -------------------------------------------- # Subset taxa for the spike-in species tse_16SOTU_spiked_taxa <- tse_16SOTU[rowData(tse_16SOTU)$Species %in% species_name, ] # Subset samples based on spike-in volume (e.g., "1" or "2") tse_16SOTU_spiked_samples <- tse_16SOTU[, colData(tse_16SOTU)$spiked.volume %in% c("2", "1") ] # -------------------------------------------- # Merge spiked OTUs using the DspikeIn function # merges monophyletic ASVs/OTUs # The function Pre_processing_species() merges ASVs/OTUs of an spiked species using "sum" or "max" methods, # preserving taxonomic, phylogenetic, and sequencing data. output_rds <- file.path(tempdir(), "merged_tse_sum.rds") Spiked_16S_sum_scaled <- Pre_processing_species( tse_16SOTU_spiked_samples, species_name, merge_method = "sum", output_file = output_rds) ## ----------------------------------------------------------------------------- # * Calculate the percentage of spiked species retrieval per sample* library(mia) library(dplyr) library(SummarizedExperiment) # Subset TSE to remove blanks Spiked_16S_sum_scaled_filtered <- Spiked_16S_sum_scaled[, colData(Spiked_16S_sum_scaled)$sample.or.blank != "blank"] # Calculate spike-in retrieval percentage Perc <- calculate_spike_percentage( Spiked_16S_sum_scaled_filtered, merged_spiked_species, #output_file = "output_tse.docx", passed_range = c(0.1, 20) ) head(Perc) ## ----fig.align='center', fig.width=7, fig.height=5, out.width='50%', dpi=120---- # The acceptable range of spiked species retrieval is system-dependent # Spiked species become centroid of the community (Distance to Centroid) # Spiked species become dominant and imbalance the community (Evenness) # What range of spiked species retrieval is appropriate for your system? # Calculate Pielou's Evenness using Shannon index and species richness (Observed) # Hill number q = 1 = exp(Shannon index), representing the effective number of equally abundant species. Hill is weighted by relative abundance, so dominant species have more influence. # Unlike Pielou's evenness, this metric is not normalized by richness and it shows Effective number of common species library(TreeSummarizedExperiment) library(SummarizedExperiment) library(dplyr) library(tibble) library(microbiome) library(mia) library(vegan) library(S4Vectors) library(ggplot2) # --- 1. Extract current metadata from TSE --- metadata <- colData(Spiked_16S_sum_scaled) %>% as.data.frame() %>% rownames_to_column("Sample") # --- 2. Add spike-in reads (Perc) --- # Ensure Perc has 'Sample' column and matching format metadata <- dplyr::left_join(metadata, Perc, by = "Sample") # --- 3. Estimate alpha diversity indices and extract --- Spiked_16S_sum_scaled <- mia::addAlpha( Spiked_16S_sum_scaled, index = c("observed", "shannon", "pielou", "hill") ) alpha_df <- colData(Spiked_16S_sum_scaled) %>% as.data.frame() %>% rownames_to_column("Sample") %>% select(Sample, observed, shannon, pielou, hill) metadata <- dplyr::left_join(metadata, alpha_df, by = "Sample") # --- 4. Compute Bray-Curtis distance to centroid --- otu_mat <- assay(Spiked_16S_sum_scaled, "counts") otu_mat <- t(otu_mat) # samples as rows otu_mat_rel <- vegan::decostand(otu_mat, method = "total") centroid_profile <- colMeans(otu_mat_rel) dist_to_centroid <- apply(otu_mat_rel, 1, function(x) { vegan::vegdist(rbind(x, centroid_profile), method = "bray")[1] }) # Match and assign distances metadata$Dist_to_Centroid <- dist_to_centroid[metadata$Sample] # --- 5. Assign updated metadata to TSE --- metadata <- column_to_rownames(metadata, var = "Sample") metadata <- metadata[colnames(Spiked_16S_sum_scaled), , drop = FALSE] # ensure correct order and size colData(Spiked_16S_sum_scaled) <- S4Vectors::DataFrame(metadata) # 4. Regression Plots: Diversity vs. Spike-in Reads # ============================================================================ # Pielous Evenness plot_object_pielou <- regression_plot( data = metadata, x_var = "pielou", y_var = "Spiked_Reads", custom_range = c(0.1, 20, 30, 40, 50, 60, 100), plot_title = "Pielous Evenness vs. Spike-in Reads" ) # Hill Number (q = 1) plot_object_hill <- regression_plot( data = metadata, x_var = "hill", y_var = "Spiked_Reads", custom_range = c(0.1, 10, 20, 30, 100), plot_title = "Hill Number (q = 1) vs. Spike-in Reads" ) plot_object_hill # Distance to Centroid plot_object_centroid <- regression_plot( data = metadata, x_var = "Dist_to_Centroid", y_var = "Spiked_Reads", custom_range = c(0.1, 20, 30, 40, 50, 60, 100), plot_title = "Distance to Global Centroid vs. Spike-in Reads" ) # Interpretation # ============================================================================ # - Pielou's evenness is normalized by richness; useful for detecting imbalance. # - Hill number q = 1 gives effective number of common species; sensitive to dominance. # - Distance to centroid in full Bray-Curtis space shows deviation from the average community. ## ----eval=FALSE--------------------------------------------------------------- # # # PREREQUISITE FOR ITS & CALCULATE SPIKED % # # # # Define spiked species and related parameters** # # # # Define the spiked species # # spiked_cells <- 733 # # species_name <- spiked_species <- merged_spiked_species <- "Dekkera_bruxellensis" # # # Subset taxa for spiked species # # Dekkera <- phyloseq::subset_taxa( # # physeq_ITSOTU, # # Species %in% species_name) # # # hashcodes <- row.names(phyloseq::tax_table(Dekkera)) # # # Subset samples based on spiked volume # # physeq_ITSOTU_spiked <- phyloseq::subset_samples(physeq_ITSOTU, spiked.volume %in% c("2", "1")) # # # if TSE format # # tse_ITSOTU <- convert_phyloseq_to_tse(physeq_ITSOTU) # # physeq_ITSOTU_spiked <- tse_ITSOTU[, tse_ITSOTU$spiked.volume %in% c("2", "1")] ## ----------------------------------------------------------------------------- # CALCULATE SCALING FACTORS result <- calculate_spikeIn_factors( Spiked_16S_sum_scaled_filtered, spiked_cells = spiked_cells, merged_spiked_species = species_name) # View extracted outputs result$spiked_species_reads # Merged spiked species name result$total_reads # Total reads detected for the spike scaling_factors <- result$scaling_factors head(scaling_factors) # Vector of scaling factors per sample ## ----------------------------------------------------------------------------- # Convert relative counts to absolute counts # absolute counts=relative countssample-specific scaling factor # Convert to absolute counts library(DspikeIn) absolute <- convert_to_absolute_counts(Spiked_16S_sum_scaled_filtered, scaling_factors) # Extract processed data absolute_counts <- absolute$absolute_counts tse_absolute <- absolute$obj_adj tse_absolute <- tidy_phyloseq_tse(tse_absolute) # View absolute count data head(tse_absolute) ## ----------------------------------------------------------------------------- # CALCULATE SPIKE PERCENTAGE & summary stat # ** Calculate spike percentage & Generate summary statistics for absolute counts** # absolute_counts is a data.frame of OTU/ASV table # Generate summary statistics for absolute counts post_eval_summary <- calculate_summary_stats_table(absolute_counts) # You may want to Back normal to check calculation accuracy # the scaling factor was computed based on spiked species reads and fixed cell count. # Multiplying the spiked species read count by this scaling factor restores the exact spiked cell count. # lets check it # BackNormal <- calculate_spike_percentage( # tse_absolute, # merged_spiked_species, # passed_range = c(0.1, 20)) ## ----------------------------------------------------------------------------- # * Calculate the percentage of spiked species retrieval per sample* library(mia) library(dplyr) library(SummarizedExperiment) # Subset TSE to remove blanks tse_absolute_filtered <- tse_absolute[, colData(tse_absolute)$sample.or.blank != "blank"] # Generate conclusion report conc <- conclusion( tse_absolute_filtered, merged_spiked_species, max_passed_range = 20, output_path = output_path) conc$full_report # Filter to keep only the samples that passed passed_samples <- Perc$Sample[Perc$Result == "passed"] # Subset the TSE object to include only passed samples tse_passed <- tse_absolute_filtered[, colnames(tse_absolute_filtered) %in% passed_samples] dim(tse_passed) ## ----fig.align='center', fig.width=7, fig.height=5, out.width='70%', dpi=120---- pps_Abs <- DspikeIn::get_long_format_data(tse_passed) # calculation for relative abundance needs sum of total reads # total_reads <- sum(pps_Abs$Abundance) # Generate an alluvial plot alluvial_plot_abs <- alluvial_plot( data = pps_Abs, axes = c("Animal.type", "Ecoregion.III","Diet", "Animal.ecomode"), abundance_threshold = 5000, fill_variable = "Class", silent = TRUE, abundance_type = "absolute", top_taxa = 10, text_size = 3, legend_ncol = 1, custom_colors = DspikeIn::color_palette$MG_Awesome # Use the color palette from DspikeIn ) ## ----------------------------------------------------------------------------- # you may need to normalize/transform your data to reduce biases ps <- tse_passed # TC Normalization result_TC <- normalization_set(ps, method = "TC", groups = "Host.species") normalized_ps_TC <- result_TC$dat.normed scaling_factors_TC <- result_TC$scaling.factor # UQ Normalization # result_UQ <- normalization_set(ps, method = "UQ", groups = "Host.species") # normalized_ps_UQ <- result_UQ$dat.normed # scaling_factors_UQ <- result_UQ$scaling.factor # # # Median Normalization # result_med <- normalization_set(ps, method = "med", groups = "Host.species") # normalized_ps_med <- result_med$dat.normed # scaling_factors_med <- result_med$scaling.factor # DESeq Normalization # ps_n <- remove_zero_negative_count_samples(ps) # result_DESeq <- normalization_set(ps_n, method = "DESeq", groups = "Animal.type") # normalized_ps_DESeq <- result_DESeq$dat.normed # scaling_factors_DESeq <- result_DESeq$scaling.factor # Poisson Normalization # result_Poisson <- normalization_set(ps, method = "Poisson", groups = "Host.genus") # normalized_ps_Poisson <- result_Poisson$dat.normed # scaling_factors_Poisson <- result_Poisson$scaling.factor # Quantile Normalization # result_QN <- normalization_set(ps, method = "QN") # normalized_ps_QN <- result_QN$dat.normed # scaling_factors_QN <- result_QN$scaling.factor # TMM Normalization # result_TMM <- normalization_set(ps, method = "TMM", groups = "Animal.type") # normalized_ps_TMM <- result_TMM$dat.normed # scaling_factors_TMM <- result_TMM$scaling.factor # CLR Normalization # result_clr <- normalization_set(ps, method = "clr") # normalized_ps_clr <- result_clr$dat.normed # scaling_factors_clr <- result_clr$scaling.factor # Rarefying # result_rar <- normalization_set(ps, method = "rar") # normalized_ps_rar <- result_rar$dat.normed # scaling_factors_rar <- result_rar$scaling.factor # CSS Normalization # result_css <- normalization_set(ps, method = "css") # normalized_ps_css <- result_css$dat.normed # scaling_factors_css <- result_css$scaling.factor # # TSS Normalization # result_tss <- normalization_set(ps, method = "tss") # normalized_ps_tss <- result_tss$dat.normed # scaling_factors_tss <- result_tss$scaling.factor # RLE Normalization # result_rle <- normalization_set(ps, method = "rle") # normalized_ps_rle <- result_rle$dat.normed # scaling_factors_rle <- result_rle$scaling.factor ## ----fig.align='center', fig.width=7, fig.height=5, out.width='50%', dpi=120---- normalized_tse_TC<-convert_phyloseq_to_tse(normalized_ps_TC) rf_tse <- RandomForest_selected( normalized_ps_TC, response_var = "Host.genus", na_vars = c("Habitat", "Ecoregion.III", "Host.genus", "Diet") ) ridge_tse<- ridge_plot_it(rf_tse, taxrank = "Family", top_n = 10) + scale_fill_manual(values = DspikeIn::color_palette$cool_MG) ridge_tse ## ----fig.align='center', fig.width=11, fig.height=7, out.width='60%', dpi=120---- library(mia) library(dplyr) library(SummarizedExperiment) # ---Remove unwanted taxa by Genus, Family, or Order --- tse_filtered <- tse_absolute[ rowData(tse_absolute)$Genus != "Tetragenococcus" & rowData(tse_absolute)$Family != "Chloroplast" & rowData(tse_absolute)$Order != "Chloroplast", ] tse_caudate <- tse_filtered[, colData(tse_filtered)$Clade.Order == "Caudate"] genera_keep <- c("Desmognathus", "Plethodon", "Eurycea") tse_three_genera <- tse_caudate[, colData(tse_caudate)$Host.genus %in% genera_keep] tse_blue_ridge <- tse_three_genera[, colData(tse_three_genera)$Ecoregion.III == "Blue Ridge"] tse_desmog <- tse_blue_ridge[, colData(tse_blue_ridge)$Host.genus == "Desmognathus"] # --- Differential Abundance with DESeq2 --- results_DESeq2 <- perform_and_visualize_DA( obj = tse_desmog, method = "DESeq2", group_var = "Host.taxon", contrast = c("Desmognathus monticola", "Desmognathus imitator"), output_csv_path = "DA_DESeq2.csv", target_glom = "Genus", significance_level = 0.01) results_DESeq2$results results_DESeq2$obj_significant # Optional: Visualization results_DESeq2$bar_plot ## ----eval=FALSE--------------------------------------------------------------- # # # # DspikeIn: Differential Abundance Examples # # Using perform_and_visualize_DA() with multiple contrast scenarios # # # 1. Single Contrast # # res_single <- perform_and_visualize_DA( # obj = tse_absolute, # method = "DESeq2", # group_var = "Diet", # contrast = c("Insectivore", "Carnivore"), # target_glom = "Genus") # # # # # 2. Single Factor All Pairwise Contrasts # col_df <- as.data.frame(colData(tse_absolute)) # col_df$Host.taxon <- factor(make.names(as.character(col_df$Host.taxon))) # colData(tse_absolute)$Host.taxon <- col_df$Host.taxon # # # Get unique DESeq2-compatible factor levels # host_levels <- levels(col_df$Host.taxon) # print(host_levels) # # # contrast list # contrast_named <- list( # Host.taxon = combn(host_levels, 2, simplify = FALSE)) # # # multiple pairwise contrasts # res_multi <- perform_and_visualize_DA( # obj = tse_absolute, # method = "DESeq2", # group_var = "Host.taxon", # contrast = contrast_named, # target_glom = "Genus") # # # # # 3. Single Factor Selected Contrasts # # contrast_list <- list( # c("Insectivore", "Carnivore"), # c("Omnivore", "Herbivore")) # # res_selected <- perform_and_visualize_DA( # obj = tse_absolute, # method = "DESeq2", # group_var = "Diet", # contrast = contrast_list, # global_fdr = TRUE) # # # # # 4. Multiple Factors Selected Contrasts # # contrast_named <- list( # Diet = list( # c("Insectivore", "Carnivore"), # c("Omnivore", "Carnivore") ), # Animal.type = list( # c("Frog", "Salamander") )) # # res_multi_factor <- perform_and_visualize_DA( # obj = tse_absolute, # method = "DESeq2", # contrast = contrast_named, # target_glom = "Genus", # significance_level = 0.01, # global_fdr = TRUE) # # # # # 5. Multiple Factors all pairwise contrasts # # colData(tse_absolute)$Host.taxon <- droplevels(factor(colData(tse_absolute)$Host.taxon)) # colData(tse_absolute)$Habitat <- droplevels(factor(colData(tse_absolute)$Habitat)) # host_levels <- levels(colData(tse_absolute)$Host.taxon) # habitat_levels <- levels(colData(tse_absolute)$Habitat) # # # Define pairwise contrasts as a named list # contrast_named <- list( # Host.taxon = combn(host_levels, 2, simplify = FALSE), # Habitat = combn(habitat_levels, 2, simplify = FALSE)) # # # Define ComboGroup variable # colData(tse_absolute)$ComboGroup <- factor(interaction( # colData(tse_absolute)$Animal.type, # colData(tse_absolute)$Diet, # drop = TRUE)) # # # # # --- Run multi-factor DESeq2 comparisons --- # results_multi_factor <- perform_and_visualize_DA( # obj = tse_absolute, # method = "DESeq2", # contrast = contrast_named, # target_glom = "Genus", # significance_level = 0.01) # # # --- Combined Group Interactions --- # colData(tse_absolute)$ComboGroup <- factor(interaction( # colData(tse_absolute)$Animal.type, # colData(tse_absolute)$Diet, # drop = TRUE)) # # # --- Define custom interaction contrasts --- # contrast_list <- list( # c("Salamander.Insectivore", "Lizard.Insectivore"), # c("Salamander.Carnivore", "Snake.Carnivore"), # c("Salamander.Carnivore", "Frog.Carnivore")) # # # --- Run combined group comparisons --- # res_combo <- perform_and_visualize_DA( # obj = tse_absolute, # method = "DESeq2", # group_var = "ComboGroup", # contrast = contrast_list, # target_glom = "Genus", # global_fdr = TRUE) # ## ----------------------------------------------------------------------------- # -------------------------------------------- # Extract OTU matrices from TSE objects # -------------------------------------------- # relative dataset -> tse_desmog_rel # tse_16SOTU_spiked_samples <- tse_16SOTU[, colData(tse_16SOTU)$spiked.volume %in% c("2", "1") ] tse_filtered_rel <- tse_16SOTU_spiked_samples[ rowData(tse_16SOTU_spiked_samples)$Genus != "Tetragenococcus" & rowData(tse_16SOTU_spiked_samples)$Family != "Chloroplast" & rowData(tse_16SOTU_spiked_samples)$Order != "Chloroplast", ] tse_caudate_rel <- tse_filtered_rel[, colData(tse_filtered_rel)$Clade.Order == "Caudate"] genera_keep <- c("Desmognathus", "Plethodon", "Eurycea") tse_three_genera_rel <- tse_caudate[, colData(tse_caudate_rel)$Host.genus %in% genera_keep] tse_blue_ridge_rel <- tse_three_genera_rel[, colData(tse_three_genera_rel)$Ecoregion.III == "Blue Ridge"] tse_desmog_rel <- tse_blue_ridge_rel[, colData(tse_blue_ridge_rel)$Host.genus == "Desmognathus"] # absolute dataset -> tse_desmog otu_rel <- assay(tse_desmog_rel, "counts") otu_abs <- assay(tse_desmog, "counts") # Make sure samples are rows and taxa are columns otu_rel <- t(otu_rel) otu_abs <- t(otu_abs) # Ensure they are matrices otu_rel <- as.matrix(otu_rel) otu_abs <- as.matrix(otu_abs) # -------------------------------------------- # 2. Convert to Presence/Absence # -------------------------------------------- otu_rel_pa <- (otu_rel > 0) * 1 otu_abs_pa <- (otu_abs > 0) * 1 # -------------------------------------------- # 3. Identify common samples & taxa # -------------------------------------------- shared_samples <- intersect(rownames(otu_rel_pa), rownames(otu_abs_pa)) shared_taxa <- intersect(colnames(otu_rel_pa), colnames(otu_abs_pa)) # Subset to common set otu_rel_pa <- otu_rel_pa[shared_samples, shared_taxa] otu_abs_pa <- otu_abs_pa[shared_samples, shared_taxa] # -------------------------------------------- # 4. Compare presence/absence profiles # -------------------------------------------- shared_pa <- (otu_rel_pa + otu_abs_pa) == 2 total_present <- (otu_rel_pa + otu_abs_pa) >= 1 # Calculate percent agreement (global) percent_shared <- sum(shared_pa) / sum(total_present) cat("Percent of shared taxa detections (AA vs RA):", round(100 * percent_shared, 1), "%\n") ## ----fig.align='center', fig.width=8, fig.height=5, out.width='50%', dpi=120---- # Visualization of community composition # Subset rows where Genus is not Tetragenococcus and not NA keep_taxa <- !is.na(rowData(tse_absolute)$Genus) & rowData(tse_absolute)$Genus != "Tetragenococcus" # Subset the TSE by keeping only those taxa (rows) tse_filtered <- tse_absolute[keep_taxa, ] # Exclude blanks tse_filtered <- tse_filtered[, colData(tse_filtered)$sample.or.blank != "blank"] # subset Salamander samples tse_sal <- tse_filtered[, colData(tse_filtered)$Animal.type == "Salamander"] Prok_OTU_sal <- tidy_phyloseq_tse(tse_sal) TB<-taxa_barplot( Prok_OTU_sal, target_glom = "Family", custom_tax_names = NULL, normalize = TRUE, treatment_variable = "Habitat", abundance_type = "relative", x_angle = 15, fill_variable = "Family", facet_variable = "Diet", top_n_taxa = 10, palette = DspikeIn::color_palette$cool_MG, legend_size = 11, legend_columns = 1, x_scale = "free", xlab = NULL) ## ----fig.align='center', fig.width=11, fig.height=6, out.width='80%', dpi=120---- # 1. Initialization and loading NetWorks for Comparision # library(SpiecEasi) # library(ggnet) # library(igraph) library(DspikeIn) library(tidyr) library(dplyr) library(ggpubr) library(igraph) # herp.Bas is a merged phyloseq object for both bacterial and fungal domains # spiec.easi(herp.Bas, method='mb', lambda.min.ratio=1e-3, nlambda=250,pulsar.select=TRUE ) Complete <- DspikeIn::load_graphml("Complete.graphml") NoBasid <- DspikeIn::load_graphml("NoBasid.graphml") NoHubs <- DspikeIn::load_graphml("NoHubs.graphml") # degree_network -> please note that needs full path of your graphml result_kk <- DspikeIn::degree_network(load_graphml("Complete.graphml"), save_metrics = TRUE, layout_type = "stress") result_kk$metrics result <- DspikeIn::weight_Network(graph_path = "Complete.graphml") result$plot ## ----fig.align='center', fig.width=8, fig.height=6, out.width='70%', dpi=120---- # Load required libraries library(igraph) library(dplyr) library(tidyr) library(ggplot2) library(ggrepel) ggplot2::margin() # 2. Metrics Calculation completeMetrics <- DspikeIn::node_level_metrics(Complete) NoHubsMetrics <- DspikeIn::node_level_metrics(NoHubs) NoBasidMetrics <- DspikeIn::node_level_metrics(NoBasid) Complete_metrics <- completeMetrics$metrics Nohub_metrics <- NoHubsMetrics$metrics Nobasid_metrics <- NoBasidMetrics$metrics Complete_metrics <- data.frame(lapply(Complete_metrics, as.character), stringsAsFactors = FALSE) Nohub_metrics <- data.frame(lapply(Nohub_metrics, as.character), stringsAsFactors = FALSE) Nobasid_metrics <- data.frame(lapply(Nobasid_metrics, as.character), stringsAsFactors = FALSE) print(igraph::vcount(Complete)) # Number of nodes in Complete network print(igraph::ecount(Complete)) # Number of edges in Complete network print(igraph::vcount(NoBasid)) print(igraph::ecount(NoBasid)) print(igraph::vcount(NoHubs)) print(igraph::ecount(NoHubs)) metrics_scaled <- bind_rows( Complete_metrics %>% mutate(Network = "Complete"), Nohub_metrics %>% mutate(Network = "NoHubs"), Nobasid_metrics %>% mutate(Network = "NoBasid") ) %>% dplyr::mutate(dplyr::across(where(is.numeric), scale)) metrics_long_scaled <- metrics_scaled %>% tidyr::pivot_longer(cols = -c(Node, Network), names_to = "Metric", values_to = "Value") # Ensure each dataset has a "Network" column before combining completeMetrics$metrics <- completeMetrics$metrics %>% mutate(Network = "Complete Network") NoHubsMetrics$metrics <- NoHubsMetrics$metrics %>% mutate(Network = "Network & Module Hubs Removed") NoBasidMetrics$metrics <- NoBasidMetrics$metrics %>% mutate(Network = "Basidiobolus Subnetwork Removed") # Combine All Data combined_data <- bind_rows( completeMetrics$metrics, NoHubsMetrics$metrics, NoBasidMetrics$metrics ) # Add Node Identifier if missing if (!"Node" %in% colnames(combined_data)) { combined_data <- combined_data %>% mutate(Node = rownames(.)) } # Convert `Network` into Factor combined_data$Network <- factor(combined_data$Network, levels = c( "Complete Network", "Network & Module Hubs Removed", "Basidiobolus Subnetwork Removed")) # Convert Data to Long Format metrics_long <- combined_data %>% pivot_longer( cols = c("Redundancy", "Efficiency", "Betweenness"), names_to = "Metric", values_to = "Value") # Define Custom Colors and Shapes network_colors <- c( "Complete Network" = "#F1E0C5", "Network & Module Hubs Removed" = "#D2A5A1", "Basidiobolus Subnetwork Removed" = "#B2C3A8") network_shapes <- c( "Complete Network" = 21, "Network & Module Hubs Removed" = 22, "Basidiobolus Subnetwork Removed" = 23) # Determine Top 30% of Nodes to Label/Optional metrics_long <- metrics_long %>% group_by(Network, Metric) %>% mutate(Label = ifelse(rank(-Value, ties.method = "random") / n() <= 0.3, Node, NA)) # ?quadrant_plot() can creat plot for indivisual network plot4 <- quadrant_plot(completeMetrics$metrics, x_metric = "Among_Module_Connectivity", y_metric = "Degree") # Customized comparision Plots create_metric_plot <- function(metric_name, data, title) { data_filtered <- data %>% filter(Metric == metric_name) median_degree <- median(data_filtered$Degree, na.rm = TRUE) median_value <- median(data_filtered$Value, na.rm = TRUE) ggplot(data_filtered, aes(x = Degree, y = Value, fill = Network)) + geom_point(aes(shape = Network), size = 3, stroke = 1, color = "black") + geom_text_repel(aes(label = Label), size = 3, max.overlaps = 50) + scale_fill_manual(values = network_colors) + scale_shape_manual(values = network_shapes) + geom_vline(xintercept = median_degree, linetype = "dashed", color = "black", size = 1) + geom_hline(yintercept = median_value, linetype = "dashed", color = "black", size = 1) + labs( title = title, x = "Degree", y = metric_name, fill = "Network", shape = "Network" ) + theme_minimal() + theme( plot.title = element_text( hjust = 0.5, size = 16, face = "bold", margin = margin(t = 10, b = 20) # Moves the title downward ), axis.title = element_text(size = 14, face = "bold"), legend.position = "top", legend.title = element_text(size = 14, face = "bold"), legend.text = element_text(size = 12) ) } # Generate Plots plot_redundancy <- create_metric_plot("Redundancy", metrics_long, "Redundancy vs. Degree Across Networks") plot_efficiency <- create_metric_plot("Efficiency", metrics_long, "Efficiency vs. Degree Across Networks") plot_betweenness <- create_metric_plot("Betweenness", metrics_long, "Betweenness vs. Degree Across Networks") # Print Plots print(plot_betweenness) ## ----fig.align='center', fig.width=10, fig.height=8, out.width='70%', dpi=120---- # 2. Metrics Calculation result_Complete <- DspikeIn::node_level_metrics(Complete) result_NoHubs <- DspikeIn::node_level_metrics(NoHubs) result_NoBasid <- DspikeIn::node_level_metrics(NoBasid) Complete_metrics <- result_Complete$metrics Nohub_metrics <- result_NoHubs$metrics Nobasid_metrics <- result_NoBasid$metrics Complete_metrics <- data.frame(lapply(Complete_metrics, as.character), stringsAsFactors = FALSE) Nohub_metrics <- data.frame(lapply(Nohub_metrics, as.character), stringsAsFactors = FALSE) Nobasid_metrics <- data.frame(lapply(Nobasid_metrics, as.character), stringsAsFactors = FALSE) print(igraph::vcount(Complete)) # Number of nodes in Complete network print(igraph::ecount(Complete)) # Number of edges in Complete network print(igraph::vcount(NoBasid)) print(igraph::ecount(NoBasid)) print(igraph::vcount(NoHubs)) print(igraph::ecount(NoHubs)) metrics_scaled <- bind_rows( Complete_metrics %>% mutate(Network = "Complete"), Nohub_metrics %>% mutate(Network = "NoHubs"), Nobasid_metrics %>% mutate(Network = "NoBasid") ) %>% dplyr::mutate(dplyr::across(where(is.numeric), scale)) metrics_long_scaled <- metrics_scaled %>% tidyr::pivot_longer(cols = -c(Node, Network), names_to = "Metric", values_to = "Value") # Remove missing values metrics_long_scaled <- na.omit(metrics_long_scaled) # We visualize only six metrics selected_metrics <- c( "Degree", "Closeness", "Betweenness", "EigenvectorCentrality", "PageRank", "Transitivity" ) metrics_long_filtered <- metrics_long_scaled %>% filter(Metric %in% selected_metrics) %>% mutate( Value = as.numeric(as.character(Value)), Network = recode(Network, "Complete" = "Complete Network", "NoHubs" = "Network & Module Hubs Removed", "NoBasid" = "Basidiobolus Subnetwork Removed" ) ) %>% na.omit() # Remove any NA if any metrics_long_filtered$Network <- factor(metrics_long_filtered$Network, levels = c( "Complete Network", "Network & Module Hubs Removed", "Basidiobolus Subnetwork Removed" )) # DspikeIn::color_palette$light_MG network_colors <- c( "Complete Network" = "#F1E0C5", "Network & Module Hubs Removed" = "#D2A5A1", "Basidiobolus Subnetwork Removed" = "#B2C3A8") # statistical comparisons a vs b comparisons <- list( c("Complete Network", "Network & Module Hubs Removed"), c("Complete Network", "Basidiobolus Subnetwork Removed"), c("Network & Module Hubs Removed", "Basidiobolus Subnetwork Removed")) networks_in_data <- unique(metrics_long_filtered$Network) comparisons <- comparisons[sapply(comparisons, function(pair) all(pair %in% networks_in_data))] met <- ggplot(metrics_long_filtered, aes(x = Network, y = Value, fill = Network)) + geom_boxplot(outlier.shape = NA) + geom_jitter(aes(color = Network), position = position_jitter(0.2), alpha = 0.2, size = 1.5 ) + scale_fill_manual(values = network_colors) + scale_color_manual(values = network_colors) + facet_wrap(~Metric, scales = "free_y", labeller = label_wrap_gen(width = 20)) + ggpubr::stat_compare_means(method = "wilcox.test", label = "p.signif", comparisons = comparisons) + theme_minimal() + theme( axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = 10, angle = 10, hjust = 0.8), strip.text = element_text(size = 12, margin = margin(t = 15, b = 5)), legend.position = "top", legend.text = element_text(size = 13), legend.title = element_text(size = 13, face = "bold"), plot.title = element_text(size = 13, face = "bold") ) + labs(title = "Selected Node Metrics Across Networks", fill = "Network Type", color = "Network Type") met ## ----------------------------------------------------------------------------- Complete <- load_graphml("Complete.graphml") result2 <- DspikeIn::extract_neighbors( graph = Complete, target_node = "OTU69:Basidiobolus_sp", mode = "all") head(result2$summary) ## ----fig.align='center', fig.width=8, fig.height=5, out.width='60%', dpi=120---- # Options: `"stress"` (default), `"graphopt"`, `"fr"` # Compute degree metrics result <- degree_network(graph_path = Complete, save_metrics = TRUE) # Compute network weights for different graph structures NH <- weight_Network(graph_path = "NoHubs.graphml") NB <- weight_Network(graph_path = "NoBasid.graphml") C <- weight_Network(graph_path = "Complete.graphml") # Extract metrics from the computed network weights CompleteM <- C$metrics NoHubsM <- NH$metrics NoBasidM <- NB$metrics # Combine metrics into a single dataframe for comparison df <- bind_rows( CompleteM %>% mutate(Group = "Complete Network"), NoHubsM %>% mutate(Group = "Network & Module Hubs Removed"), NoBasidM %>% mutate(Group = "Basidiobolus Subnetwork Removed") ) %>% pivot_longer(cols = -Group, names_to = "Metric", values_to = "Value") # Aggregate total values by metric and group df_bar <- df %>% group_by(Metric, Group) %>% summarise(Total_Value = sum(Value, na.rm = TRUE), .groups = "drop") # Order metrics by mean value (optional) metric_order <- df_bar %>% group_by(Metric) %>% summarise(mean_value = mean(Total_Value)) %>% arrange(desc(mean_value)) %>% pull(Metric) df_bar$Metric <- factor(df_bar$Metric, levels = metric_order) # Define refined color palette network_colors <- c( "Complete Network" = "#F1E0C5", "Network & Module Hubs Removed" = "#D2A5A1", "Basidiobolus Subnetwork Removed" = "#B2C3A8" ) # Create elegant bar plot pg <- ggplot(df_bar, aes(x = Metric, y = log1p(Total_Value), fill = Group)) + geom_bar(stat = "identity", position = position_dodge(width = 0.8), width = 0.7, alpha = 0.9) + scale_fill_manual(values = network_colors) + theme_minimal(base_size = 14) + theme( panel.grid.major.x = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 11, angle = 15, hjust = 1, vjust = 1, color = "gray20"), axis.text.y = element_text(size = 11, color = "gray25"), axis.title = element_text(size = 13, face = "bold"), plot.title = element_text(size = 15, face = "bold", hjust = 0.5, margin = ggplot2::margin(b = 10)), legend.position = "top", legend.title = element_blank(), legend.text = element_text(size = 12), panel.background = element_rect(fill = "white", color = NA), plot.background = element_rect(fill = "white", color = NA) ) + labs( subtitle = "Comparison of total network-level metrics (log-scaled)", x = NULL, y = "Total Value (log1p)" ) + geom_text( aes(label = round(log1p(Total_Value), 1)), position = position_dodge(width = 0.8), vjust = -0.3, size = 3.5, color = "gray25" ) ## ----eval=FALSE--------------------------------------------------------------- # library(igraph) # library(tidygraph) # library(ggraph) # library(DspikeIn) # # # # AA abundance # Complete<-load_graphml("Complete.graphml") # # deg <- degree(Complete) # # hist(deg, breaks = 30, main = "Degree Distribution", xlab = "Degree") # fit <- fit_power_law(deg + 1) # Avoid zero degrees # print(fit) # # # # ---- empirical network ---- # g_empirical = Complete # #g_empirical = Complete_Rel # # # # # ---- Calculate real network metrics ---- # creal <- transitivity(g_empirical, type = "global") # E(g_empirical)$weight <- abs(E(g_empirical)$weight) # lreal <- mean_distance(g_empirical, directed = FALSE, unconnected = TRUE, weights = E(g_empirical)$weight) # # # ---- Generate 1000 Erds-Rnyi random graphs with same size ---- # set.seed(42) # for reproducibility # n_nodes <- vcount(g_empirical) # n_edges <- ecount(g_empirical) # # crand_vals <- numeric(1000) # lrand_vals <- numeric(1000) # # for (i in 1:1000) { # g_rand <- sample_gnm(n = n_nodes, m = n_edges, directed = FALSE) # if (!is_connected(g_rand)) next # # crand_vals[i] <- transitivity(g_rand, type = "global") # lrand_vals[i] <- mean_distance(g_rand, directed = FALSE, unconnected = TRUE) # } # # # ---- Calculate mean values across random graphs ---- # crand <- mean(crand_vals, na.rm = TRUE) # lrand <- mean(lrand_vals, na.rm = TRUE) # # # ---- Compute Small-World Index () ---- # sigma <- (creal / crand) / (lreal / lrand) # # # cat("Global clustering coefficient (real):", creal, "\n") # cat("Average path length (real):", lreal, "\n") # cat("Mean clustering coefficient (random):", crand, "\n") # cat("Mean path length (random):", lrand, "\n") # cat("Small-World Index ():", round(sigma, 2), "\n") # ## ----------------------------------------------------------------------------- sessionInfo() ## ----cleanup, include = FALSE------------------------------------------------- # ===================================================================== # FINAL CLEANUP: Remove files created during vignette execution # ===================================================================== # Note: This cleanup ensures no large or temporary files remain after vignette execution. # All deletions are conditional (safe) and comply with CRAN/Bioconductor policies. files_to_delete <- c( "Global_Network_Metrics.csv", "absolute_counts.csv", "Summary.csv", "post eval summary.csv", "abundance_analysis_filtered_phyloseq.rds", "abundance_analysis_filtered_tse.rds", "abundance_analysis_high_abundance_phyloseq.rds", "abundance_analysis_high_abundance_tse.rds", "abundance_analysis_low_abundance_phyloseq.rds", "abundance_analysis_low_abundance_tse.rds", "adjusted_prevalence.rds", "core_microbiome.csv", "core_microbiome.rds", "comp_spikein_patristic_distances.csv", "comp_spikein_branch_lengths.pdf", "comp_spikein_patristic_distances_hist.pdf", "merged_data.csv", "merged_data.docx", "comp_spikein.nwk", "comp_spikein_tree.png", "comp_spikein_patristic_distances_hist.pdf", "post_eval_summary.csv", "post_eval_summary.docx", "merged_physeq_max_processed.rds", "merged_physeq_sum.rds", "merged_physeq_sum_processed.rds", "spikeIn_factors_output", "plot_Spike.percentage_Kruskal.pdf", "plot_Spike.percentage_Kruskal.png", "plot_Spike.reads_Kruskal.pdf", "plot_Spike.reads_Kruskal.png", "plot_Total.reads_Kruskal.pdf", "plot_Total.reads_Kruskal.png", "proportion_adjusted_physeq.rds", "proportion_adjusted_tse.rds", "randomsubsample_Trimmed_evenDepth.rds", "summary.csv", "summary.docx", "DspikeIn-manual.tex", "output.csv", "output.docx", "tree_with_bootstrap_and_cophenetic.png" ) # Safely remove existing files if (any(file.exists(files_to_delete))) { invisible(file.remove(files_to_delete[file.exists(files_to_delete)])) } # Safely remove known temporary folders folders_to_delete <- c("spikeIn_factors_output") for (folder in folders_to_delete) { if (dir.exists(folder)) unlink(folder, recursive = TRUE, force = TRUE) }