## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 10,
  fig.height = 7,
  dpi = 300,
  out.width = "100%"
)

## ----setup, message = FALSE---------------------------------------------------
library(evanverse)
library(dplyr)
library(ggplot2)
library(tidyr)

## ----gene-conversion-demo, eval = FALSE---------------------------------------
# # Example gene symbols commonly used in cancer research
# cancer_genes <- c("BRCA1", "BRCA2", "TP53", "EGFR", "MYC", "RAS", "PIK3CA", "AKT1")
# 
# # Convert gene symbols to Ensembl IDs
# ensembl_ids <- convert_gene_id(
#   genes = cancer_genes,
#   from = "symbol",
#   to = "ensembl",
#   species = "human"
# )
# 
# # Display conversion results
# conversion_table <- data.frame(
#   Gene_Symbol = cancer_genes,
#   Ensembl_ID = ensembl_ids
# )
# 
# print(conversion_table)

## ----gene-conversion-mock-----------------------------------------------------
# Mock example for demonstration (since biomaRt requires internet)
cancer_genes <- c("BRCA1", "BRCA2", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1")

# Simulated conversion results
mock_conversion <- data.frame(
  Gene_Symbol = cancer_genes,
  Ensembl_ID = c(
    "ENSG00000012048", "ENSG00000139618", "ENSG00000141510",
    "ENSG00000146648", "ENSG00000136997", "ENSG00000133703",
    "ENSG00000171608", "ENSG00000142208"
  ),
  Entrez_ID = c(672, 675, 7157, 1956, 4609, 3845, 5290, 207),
  stringsAsFactors = FALSE
)

cat("🧬 Gene ID Conversion Example\n")
cat("=============================\n")
print(mock_conversion)

cat("\n📊 Conversion Summary:\n")
cat("  • Input genes:", length(cancer_genes), "\n")
cat("  • Successful conversions:", nrow(mock_conversion), "\n")
cat("  • Success rate:", round(100 * nrow(mock_conversion) / length(cancer_genes), 1), "%\n")

## ----conversion-workflow------------------------------------------------------
# Simulate a real-world scenario with mixed identifier types
mixed_identifiers <- c(
  "BRCA1", "ENSG00000139618", "7157", "EGFR",
  "ENSG00000136997", "3845", "PIK3CA", "207"
)

# Function to detect identifier type
detect_id_type <- function(ids) {
  sapply(ids, function(id) {
    if (grepl("^ENSG", id)) return("ensembl")
    if (grepl("^[0-9]+$", id)) return("entrez")
    return("symbol")
  })
}

id_types <- detect_id_type(mixed_identifiers)
cat("🔍 Identifier Type Detection:\n")
print(data.frame(
  Identifier = mixed_identifiers,
  Detected_Type = id_types
))

# Group by identifier type for batch conversion
id_groups <- split(mixed_identifiers, id_types)
cat("\n📦 Grouped Identifiers for Conversion:\n")
str(id_groups)

## ----gmt-processing-demo, eval = FALSE----------------------------------------
# # Example: Process a pathway GMT file
# # pathway_df <- gmt2df("path/to/c2.cp.kegg.v7.4.symbols.gmt")
# # pathway_list <- gmt2list("path/to/c2.cp.kegg.v7.4.symbols.gmt")
# 
# # Display structure
# # head(pathway_df, 10)
# # length(pathway_list)

## ----gmt-mock-demo------------------------------------------------------------
# Create mock GMT data to demonstrate structure
mock_pathways <- list(
  "KEGG_GLYCOLYSIS_GLUCONEOGENESIS" = c(
    "HK1", "HK2", "GPI", "PFKL", "ALDOA", "TPI1", "GAPDH",
    "PGK1", "PGAM1", "ENO1", "PKM", "LDHA", "PDK1"
  ),
  "KEGG_CITRATE_CYCLE" = c(
    "CS", "ACO1", "IDH1", "OGDH", "SUCLA2", "SDHA",
    "FH", "MDH1", "PCK1", "PDK1", "DLAT"
  ),
  "KEGG_FATTY_ACID_SYNTHESIS" = c(
    "ACACA", "FASN", "ACLY", "ACC2", "ELOVL6", "SCD",
    "FADS1", "FADS2", "ACSL1", "GPAM"
  ),
  "KEGG_DNA_REPAIR" = c(
    "BRCA1", "BRCA2", "TP53", "ATM", "CHEK1", "CHEK2",
    "RAD51", "XRCC1", "PARP1", "MSH2", "MLH1"
  )
)

# Convert list to data frame format (simulating gmt2df output)
mock_gmt_df <- do.call(rbind, lapply(names(mock_pathways), function(pathway) {
  data.frame(
    pathway = pathway,
    gene = mock_pathways[[pathway]],
    stringsAsFactors = FALSE
  )
}))

cat("📋 GMT File Processing Results\n")
cat("==============================\n")
cat("Number of pathways:", length(mock_pathways), "\n")
cat("Total gene-pathway associations:", nrow(mock_gmt_df), "\n")
cat("Average genes per pathway:", round(mean(lengths(mock_pathways)), 1), "\n\n")

cat("Sample pathway data frame:\n")
print(head(mock_gmt_df, 12))

# Pathway size distribution
pathway_sizes <- lengths(mock_pathways)
cat("\n📊 Pathway Size Distribution:\n")
print(data.frame(
  Pathway = names(pathway_sizes),
  Gene_Count = pathway_sizes
))

## ----gene-set-overlap, fig.cap="Gene set overlap analysis showing relationships between biological pathways"----
# Analyze overlaps between pathways
pathway_genes <- mock_pathways[1:3]  # Use first 3 pathways for Venn diagram

# Create Venn diagram for pathway overlaps
venn_plot <- plot_venn(
  set1 = pathway_genes[[1]],
  set2 = pathway_genes[[2]],
  set3 = pathway_genes[[3]],
  category.names = names(pathway_genes),
  fill = get_palette("vividset", type = "qualitative", n = 3),
  title = "Metabolic Pathway Gene Overlaps"
)

print(venn_plot)

# Calculate detailed overlap statistics
all_genes <- unique(unlist(pathway_genes))
cat("\n🔍 Detailed Overlap Analysis:\n")
cat("===============================\n")
cat("Total unique genes across pathways:", length(all_genes), "\n")

# Pairwise overlaps
pathway_names <- names(pathway_genes)
for (i in 1:(length(pathway_names) - 1)) {
  for (j in (i + 1):length(pathway_names)) {
    overlap <- length(intersect(pathway_genes[[i]], pathway_genes[[j]]))
    cat(sprintf("%s ∩ %s: %d genes\n",
                gsub("KEGG_", "", pathway_names[i]),
                gsub("KEGG_", "", pathway_names[j]),
                overlap))
  }
}

## ----rnaseq-workflow, fig.cap="Differential expression analysis visualization with volcano plot"----
# Simulate RNA-seq differential expression results
set.seed(123)
n_genes <- 2000

# Simulate log fold changes and p-values
gene_results <- data.frame(
  Gene = paste0("Gene_", 1:n_genes),
  LogFC = rnorm(n_genes, mean = 0, sd = 1.2),
  PValue = rbeta(n_genes, shape1 = 1, shape2 = 10),
  stringsAsFactors = FALSE
)

# Add some significant genes
significant_indices <- sample(1:n_genes, 200)
gene_results$LogFC[significant_indices] <- gene_results$LogFC[significant_indices] +
  sample(c(-2, 2), 200, replace = TRUE)
gene_results$PValue[significant_indices] <- gene_results$PValue[significant_indices] * 0.01

# Calculate adjusted p-values
gene_results$FDR <- p.adjust(gene_results$PValue, method = "BH")

# Classify genes
gene_results$Regulation <- "Not Significant"
gene_results$Regulation[gene_results$FDR < 0.05 & gene_results$LogFC > 1] <- "Up-regulated"
gene_results$Regulation[gene_results$FDR < 0.05 & gene_results$LogFC < -1] <- "Down-regulated"

# Create volcano plot
volcano_colors <- c(
  "Up-regulated" = get_palette("vividset", type = "qualitative", n = 3)[1],
  "Down-regulated" = get_palette("vividset", type = "qualitative", n = 3)[2],
  "Not Significant" = "#CCCCCC"
)

p1 <- ggplot(gene_results, aes(x = LogFC, y = -log10(FDR), color = Regulation)) +
  geom_point(alpha = 0.6, size = 1.2) +
  scale_color_manual(values = volcano_colors) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "#666666") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "#666666") +
  labs(
    title = "Differential Gene Expression Analysis",
    subtitle = "Volcano plot showing treatment vs. control comparison",
    x = "Log₂ Fold Change",
    y = "-log₁₀(FDR-adjusted p-value)",
    color = "Regulation"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
    plot.subtitle = element_text(size = 11, color = "#666666"),
    legend.position = "bottom"
  )

print(p1)

# Summary statistics
regulation_summary <- table(gene_results$Regulation)
cat("\n📊 Differential Expression Summary:\n")
cat("===================================\n")
print(regulation_summary)

cat("\nTop 10 up-regulated genes (by fold change):\n")
top_up <- gene_results[gene_results$Regulation == "Up-regulated", ] %>%
  arrange(desc(LogFC)) %>%
  head(10)
print(top_up[, c("Gene", "LogFC", "FDR")])

## ----pathway-enrichment, fig.cap="Pathway enrichment analysis showing biological processes affected by treatment"----
# Simulate pathway enrichment analysis results
enrichment_results <- data.frame(
  Pathway = c(
    "Cell Cycle", "Apoptosis", "DNA Repair", "Inflammation",
    "Metabolism", "Signaling", "Transport", "Development"
  ),
  GeneRatio = c(0.15, 0.22, 0.18, 0.31, 0.09, 0.25, 0.12, 0.08),
  FDR = c(0.001, 0.003, 0.008, 0.0001, 0.045, 0.002, 0.021, 0.089),
  GeneCount = c(23, 34, 28, 48, 14, 39, 18, 12),
  stringsAsFactors = FALSE
)

# Calculate enrichment score
enrichment_results$EnrichmentScore <- -log10(enrichment_results$FDR)

# Create enrichment plot
p2 <- ggplot(enrichment_results, aes(x = GeneRatio, y = reorder(Pathway, EnrichmentScore))) +
  geom_point(aes(color = EnrichmentScore, size = GeneCount), alpha = 0.8) +
  scale_color_gradientn(
    colors = get_palette("warm_blush", type = "sequential", n = 4),
    name = "-log₁₀(FDR)"
  ) +
  scale_size_continuous(name = "Gene Count", range = c(3, 12)) +
  geom_vline(xintercept = 0.1, linetype = "dashed", color = "#666666", alpha = 0.7) +
  labs(
    title = "Pathway Enrichment Analysis",
    subtitle = "Biological processes enriched in differentially expressed genes",
    x = "Gene Ratio (enriched genes / pathway total)",
    y = "Biological Pathway"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
    plot.subtitle = element_text(size = 11, color = "#666666"),
    panel.grid.major.y = element_blank(),
    legend.position = "right"
  )

print(p2)

cat("\n🎯 Pathway Enrichment Summary:\n")
cat("==============================\n")
significant_pathways <- enrichment_results[enrichment_results$FDR < 0.05, ]
cat("Significant pathways (FDR < 0.05):", nrow(significant_pathways), "\n")
cat("Most enriched pathway:", significant_pathways$Pathway[which.max(significant_pathways$EnrichmentScore)], "\n")
cat("Total genes in significant pathways:", sum(significant_pathways$GeneCount), "\n")

## ----multiomics-integration, fig.cap="Multi-omics data integration showing genomic variants and expression changes"----
# Simulate multi-omics data integration
set.seed(456)
selected_genes <- c("BRCA1", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1", "PTEN")

# Create integrated omics data
omics_data <- data.frame(
  Gene = rep(selected_genes, each = 3),
  DataType = rep(c("Mutation", "CNV", "Expression"), length(selected_genes)),
  Value = c(
    # Mutation frequencies (0-1)
    c(0.12, 0.34, 0.08, 0.15, 0.22, 0.09, 0.06, 0.18),
    # Copy number variations (-2 to 2)
    c(-0.5, -1.2, 1.8, 0.3, 0.8, -0.8, 1.1, -1.5),
    # Expression fold changes (-3 to 3)
    c(-1.5, -2.8, 2.1, 1.8, -1.2, 2.3, -0.8, -2.1)
  ),
  Patient_Group = rep(c("Group_A", "Group_B", "Group_C"), length(selected_genes))
)

# Normalize values for visualization
omics_data$Normalized_Value <- ave(omics_data$Value, omics_data$DataType,
                                   FUN = function(x) scale(x)[,1])

# Create heatmap
p3 <- ggplot(omics_data, aes(x = DataType, y = Gene, fill = Normalized_Value)) +
  geom_tile(color = "white", size = 0.5) +
  scale_fill_gradientn(
    colors = get_palette("gradient_rd_bu", type = "diverging", n = 11),
    name = "Z-score",
    limits = c(-2, 2),
    breaks = c(-2, -1, 0, 1, 2)
  ) +
  labs(
    title = "Multi-omics Cancer Gene Analysis",
    subtitle = "Integrated view of mutations, copy number, and expression",
    x = "Data Type",
    y = "Cancer-related Genes"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
    plot.subtitle = element_text(size = 11, color = "#666666"),
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p3)

# Summary by data type
cat("\n🧬 Multi-omics Data Summary:\n")
cat("============================\n")
summary_stats <- omics_data %>%
  group_by(DataType) %>%
  summarise(
    Mean_Value = round(mean(Value), 3),
    SD_Value = round(sd(Value), 3),
    Min_Value = round(min(Value), 3),
    Max_Value = round(max(Value), 3),
    .groups = 'drop'
  )
print(summary_stats)

## ----survival-analysis, fig.cap="Forest plot showing hazard ratios for genetic markers in survival analysis"----
# Simulate survival analysis results
survival_data <- data.frame(
  Gene = c("BRCA1", "BRCA2", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1"),
  HazardRatio = c(1.23, 0.87, 1.45, 1.12, 0.92, 1.67, 1.34, 0.78),
  CI_Lower = c(0.98, 0.71, 1.18, 0.89, 0.75, 1.32, 1.05, 0.61),
  CI_Upper = c(1.55, 1.07, 1.78, 1.41, 1.13, 2.11, 1.71, 0.99),
  PValue = c(0.067, 0.189, 0.001, 0.324, 0.445, 0.0001, 0.018, 0.041),
  stringsAsFactors = FALSE
)

# Add significance categories
survival_data$Significance <- ifelse(survival_data$PValue < 0.001, "***",
                            ifelse(survival_data$PValue < 0.01, "**",
                            ifelse(survival_data$PValue < 0.05, "*", "ns")))

# Create forest plot using evanverse plotting functions
p4 <- plot_forest(
  data = survival_data,
  label_col = "Gene",
  estimate_col = "HazardRatio",
  lower_col = "CI_Lower",
  upper_col = "CI_Upper",
  p_col = "PValue"
)

print(p4)

cat("\n🎯 Survival Analysis Summary:\n")
cat("=============================\n")
significant_genes <- survival_data[survival_data$PValue < 0.05, ]
cat("Significant prognostic markers:", nrow(significant_genes), "\n")
cat("Risk factors (HR > 1):", sum(significant_genes$HazardRatio > 1), "\n")
cat("Protective factors (HR < 1):", sum(significant_genes$HazardRatio < 1), "\n")

print(significant_genes[, c("Gene", "HazardRatio", "PValue", "Significance")])

## ----biomarker-discovery, fig.cap="Biomarker discovery showing gene expression patterns across clinical subtypes"----
# Simulate clinical biomarker data
set.seed(789)
n_patients <- 120
n_biomarkers <- 20

# Generate patient clinical data
clinical_data <- data.frame(
  Patient_ID = paste0("P", 1:n_patients),
  Subtype = sample(c("Luminal_A", "Luminal_B", "HER2+", "TNBC"), n_patients,
                   replace = TRUE, prob = c(0.4, 0.2, 0.15, 0.25)),
  Stage = sample(c("I", "II", "III", "IV"), n_patients,
                 replace = TRUE, prob = c(0.3, 0.35, 0.25, 0.1)),
  Age = round(rnorm(n_patients, 55, 12)),
  Survival_Months = round(rexp(n_patients, rate = 0.02)),
  stringsAsFactors = FALSE
)

# Generate biomarker expression data
biomarker_genes <- paste0("Biomarker_", 1:n_biomarkers)
expression_data <- matrix(rnorm(n_patients * n_biomarkers, mean = 5, sd = 2),
                         nrow = n_patients, ncol = n_biomarkers)
colnames(expression_data) <- biomarker_genes
rownames(expression_data) <- clinical_data$Patient_ID

# Add subtype-specific expression patterns
luminal_a_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "Luminal_A"]
her2_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "HER2+"]
tnbc_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "TNBC"]

# Simulate subtype-specific biomarkers
expression_data[luminal_a_patients, "Biomarker_1"] <-
  expression_data[luminal_a_patients, "Biomarker_1"] + 3

expression_data[her2_patients, "Biomarker_5"] <-
  expression_data[her2_patients, "Biomarker_5"] + 4

expression_data[tnbc_patients, "Biomarker_12"] <-
  expression_data[tnbc_patients, "Biomarker_12"] + 2.5

# Convert to long format for visualization
expression_long <- as.data.frame(expression_data) %>%
  mutate(Patient_ID = rownames(.)) %>%
  gather(Biomarker, Expression, -Patient_ID) %>%
  left_join(clinical_data, by = "Patient_ID")

# Select top biomarkers for visualization
top_biomarkers <- c("Biomarker_1", "Biomarker_5", "Biomarker_12", "Biomarker_8")
plot_data <- expression_long %>%
  filter(Biomarker %in% top_biomarkers)

# Create biomarker expression plot
p5 <- ggplot(plot_data, aes(x = Subtype, y = Expression, fill = Subtype)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.5) +
  geom_jitter(alpha = 0.3, width = 0.2, size = 0.8) +
  scale_fill_manual(
    values = get_palette("vividset", type = "qualitative", n = 4)
  ) +
  facet_wrap(~Biomarker, scales = "free_y", ncol = 2) +
  labs(
    title = "Biomarker Expression Across Cancer Subtypes",
    subtitle = "Potential subtype-specific biomarkers for precision medicine",
    x = "Cancer Subtype",
    y = "Expression Level (log2 normalized)",
    fill = "Subtype"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
    plot.subtitle = element_text(size = 11, color = "#666666"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom",
    strip.background = element_rect(fill = "#E3F2FD", color = NA)
  )

print(p5)

# Statistical summary
cat("\n📊 Biomarker Analysis Summary:\n")
cat("==============================\n")
subtype_counts <- table(clinical_data$Subtype)
print(subtype_counts)

cat("\nMean expression by subtype for key biomarkers:\n")
biomarker_summary <- plot_data %>%
  group_by(Biomarker, Subtype) %>%
  summarise(
    Mean_Expression = round(mean(Expression), 2),
    SD = round(sd(Expression), 2),
    .groups = 'drop'
  ) %>%
  arrange(Biomarker, desc(Mean_Expression))

print(biomarker_summary)

## ----data-management, eval = FALSE--------------------------------------------
# # Example of downloading reference data
# # Note: These functions require internet connection and may take time
# 
# # Download gene reference annotation
# gene_ref <- download_gene_ref(
#   species = "human",
#   build = "hg38",
#   feature_type = "gene"
# )
# 
# # Download GEO dataset
# geo_data <- download_geo_data(
#   geo_id = "GSE123456",
#   destdir = "data/geo_downloads"
# )
# 
# # Download pathway databases
# pathway_url <- "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.4/c2.cp.kegg.v7.4.symbols.gmt"
# download_url(
#   url = pathway_url,
#   dest = "data/pathways/kegg_pathways.gmt"
# )

## ----data-management-demo-----------------------------------------------------
# Demonstrate file organization for bioinformatics projects
cat("📁 Recommended Project Structure for Bioinformatics:\n")
cat("==================================================\n")
cat("project/\n")
cat("├── data/\n")
cat("│   ├── raw/                 # Original data files\n")
cat("│   ├── processed/           # Cleaned/normalized data\n")
cat("│   ├── reference/           # Genome annotations, databases\n")
cat("│   └── results/             # Analysis outputs\n")
cat("├── scripts/\n")
cat("│   ├── preprocessing/       # Data cleaning scripts\n")
cat("│   ├── analysis/            # Statistical analysis\n")
cat("│   └── visualization/       # Plotting scripts\n")
cat("├── docs/                    # Documentation, protocols\n")
cat("└── reports/                 # Final reports, publications\n\n")

# Demonstrate batch file handling
file_extensions <- c("fastq.gz", "bam", "vcf", "gmt", "gff3", "bed")
file_descriptions <- c(
  "Raw sequencing reads",
  "Aligned sequencing data",
  "Variant calls",
  "Gene set definitions",
  "Gene annotations",
  "Genomic intervals"
)

file_info <- data.frame(
  Extension = file_extensions,
  Description = file_descriptions,
  stringsAsFactors = FALSE
)

cat("🗂️ Common Bioinformatics File Types:\n")
print(file_info)

## ----best-practices-----------------------------------------------------------
cat("🔬 BIOINFORMATICS BEST PRACTICES\n")
cat("================================\n\n")

cat("📋 Data Management:\n")
cat("  • Use version control (Git) for all scripts\n")
cat("  • Document data provenance and processing steps\n")
cat("  • Implement checkpoints and intermediate file saves\n")
cat("  • Use consistent file naming conventions\n\n")

cat("🧬 Gene Identifier Handling:\n")
cat("  • Always validate gene ID conversions\n")
cat("  • Store original identifiers alongside converted ones\n")
cat("  • Document the genome build and annotation version\n")
cat("  • Handle missing/ambiguous identifiers gracefully\n\n")

cat("📊 Statistical Analysis:\n")
cat("  • Apply appropriate multiple testing corrections\n")
cat("  • Set significance thresholds before analysis\n")
cat("  • Report effect sizes along with p-values\n")
cat("  • Validate results with independent datasets when possible\n\n")

cat("🎨 Visualization Guidelines:\n")
cat("  • Use color-blind friendly palettes\n")
cat("  • Include appropriate scales and legends\n")
cat("  • Provide clear titles and axis labels\n")
cat("  • Consider publication requirements for figures\n")

## ----qc-checklist-------------------------------------------------------------
cat("✅ QUALITY CONTROL CHECKLIST\n")
cat("============================\n\n")

cat("🔍 Data Quality:\n")
cat("  [ ] Check for missing values and outliers\n")
cat("  [ ] Verify sample sizes and statistical power\n")
cat("  [ ] Validate gene identifier mappings\n")
cat("  [ ] Assess data distribution and normalization\n\n")

cat("📈 Analysis Validation:\n")
cat("  [ ] Cross-validate results with different methods\n")
cat("  [ ] Perform sensitivity analyses\n")
cat("  [ ] Check for batch effects and confounders\n")
cat("  [ ] Compare with known biological expectations\n\n")

cat("📊 Results Reporting:\n")
cat("  [ ] Include sample sizes and effect sizes\n")
cat("  [ ] Report confidence intervals\n")
cat("  [ ] Document software versions and parameters\n")
cat("  [ ] Provide supplementary data and code\n")

## ----complete-pipeline--------------------------------------------------------
cat("🔄 COMPLETE BIOINFORMATICS PIPELINE EXAMPLE\n")
cat("===========================================\n\n")

# Simulate a complete analysis workflow
pipeline_steps <- data.frame(
  Step = 1:8,
  Process = c(
    "Data Import & Quality Control",
    "Gene ID Conversion & Mapping",
    "Differential Expression Analysis",
    "Multiple Testing Correction",
    "Pathway Enrichment Analysis",
    "Gene Set Overlap Analysis",
    "Visualization & Plotting",
    "Results Export & Reporting"
  ),
  evanverse_Functions = c(
    "read_table_flex(), file_info()",
    "convert_gene_id(), replace_void()",
    "User analysis + evanverse utilities",
    "Built-in R functions",
    "gmt2df(), gmt2list()",
    "plot_venn(), combine_logic()",
    "plot_forest(), get_palette()",
    "write_xlsx_flex(), remind()"
  ),
  Estimated_Time = c("5-10 min", "10-15 min", "30-60 min", "5 min",
                     "15-30 min", "10-20 min", "20-40 min", "10-15 min")
)

print(pipeline_steps)

cat("\n⏱️ Total Estimated Pipeline Time: 2-4 hours\n")
cat("🎯 Key Success Factors:\n")
cat("  • Proper data validation at each step\n")
cat("  • Consistent identifier handling\n")
cat("  • Appropriate statistical methods\n")
cat("  • Clear documentation and visualization\n")

## ----bio-quick-ref, eval = FALSE----------------------------------------------
# # Gene identifier conversion
# convert_gene_id(genes, from = "symbol", to = "ensembl", species = "human")
# 
# # Pathway analysis
# pathways <- gmt2list("pathways.gmt")
# plot_venn(gene_sets, colors = get_palette("vividset"))
# 
# # Data visualization
# plot_forest(survival_data, hr_col = "HazardRatio")
# get_palette("gradient_rd_bu", type = "diverging", n = 11)
# 
# # Data management
# download_geo_data("GSE123456")
# read_table_flex("expression_data.txt")