## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE, warning = FALSE, eval = TRUE ) package_root <- if (file.exists("../DESCRIPTION")) ".." else "." if (requireNamespace("pkgload", quietly = TRUE) && file.exists(file.path(package_root, "DESCRIPTION"))) { pkgload::load_all(package_root, export_all = FALSE, helpers = FALSE, quiet = TRUE) } else { library(splitGraph) } or_empty <- function(x) { if (is.null(x)) character() else x } ## ----metadata----------------------------------------------------------------- meta <- data.frame( sample_id = c("S1", "S2", "S3", "S4", "S5", "S6"), subject_id = c("P1", "P1", "P2", "P3", "P4", "P2"), batch_id = c("B1", "B2", "B1", "B3", NA, "B1"), study_id = c("ST1", "ST1", "ST1", "ST2", "ST3", "ST2"), timepoint_id = c("T0", "T1", "T0", "T2", NA, "T1"), assay_id = c("RNAseq", "RNAseq", "RNAseq", "RNAseq", "Proteomics", "RNAseq"), featureset_id = c("FS_GLOBAL", "FS_GLOBAL", "FS_GLOBAL", "FS_GLOBAL", "FS_PROT", "FS_GLOBAL"), outcome_id = c("O_case", "O_case", "O_ctrl", "O_case", "O_ctrl", "O_ctrl"), stringsAsFactors = FALSE ) meta ## ----fast-path---------------------------------------------------------------- quick_graph <- graph_from_metadata( data.frame( sample_id = c("S1", "S2", "S3", "S4", "S5", "S6"), subject_id = c("P1", "P1", "P2", "P2", "P3", "P3"), batch_id = c("B1", "B2", "B1", "B2", "B1", "B2"), timepoint_id = c("T0", "T1", "T0", "T1", "T0", "T1"), time_index = c(0, 1, 0, 1, 0, 1), outcome_value = c(0, 1, 0, 1, 1, 0) ), graph_name = "quick_demo" ) quick_graph ## ----construction------------------------------------------------------------- meta <- ingest_metadata(meta, dataset_name = "VignetteDemo") sample_nodes <- create_nodes(meta, type = "Sample", id_col = "sample_id") subject_nodes <- create_nodes(meta, type = "Subject", id_col = "subject_id") batch_nodes <- create_nodes(meta, type = "Batch", id_col = "batch_id") study_nodes <- create_nodes(meta, type = "Study", id_col = "study_id") time_nodes <- create_nodes( data.frame( timepoint_id = c("T0", "T1", "T2"), time_index = c(0L, 1L, 2L), visit_label = c("baseline", "follow_up", "late_follow_up"), stringsAsFactors = FALSE ), type = "Timepoint", id_col = "timepoint_id", attr_cols = c("time_index", "visit_label") ) assay_nodes <- create_nodes( data.frame( assay_id = c("RNAseq", "Proteomics"), modality = c("transcriptomics", "proteomics"), platform = c("NovaSeq", "Orbitrap"), stringsAsFactors = FALSE ), type = "Assay", id_col = "assay_id", attr_cols = c("modality", "platform") ) featureset_nodes <- create_nodes( data.frame( featureset_id = c("FS_GLOBAL", "FS_PROT"), featureset_name = c("global_rna_signature", "proteomics_panel"), derivation_scope = c("per_dataset", "external"), feature_count = c(500L, 80L), stringsAsFactors = FALSE ), type = "FeatureSet", id_col = "featureset_id", attr_cols = c("featureset_name", "derivation_scope", "feature_count") ) outcome_nodes <- create_nodes( data.frame( outcome_id = c("O_case", "O_ctrl"), outcome_name = c("response", "response"), outcome_type = c("binary", "binary"), observation_level = c("subject", "subject"), stringsAsFactors = FALSE ), type = "Outcome", id_col = "outcome_id", attr_cols = c("outcome_name", "outcome_type", "observation_level") ) subject_edges <- create_edges( meta, "sample_id", "subject_id", "Sample", "Subject", "sample_belongs_to_subject" ) batch_edges <- create_edges( meta, "sample_id", "batch_id", "Sample", "Batch", "sample_processed_in_batch", allow_missing = TRUE ) study_edges <- create_edges( meta, "sample_id", "study_id", "Sample", "Study", "sample_from_study" ) time_edges <- create_edges( meta, "sample_id", "timepoint_id", "Sample", "Timepoint", "sample_collected_at_timepoint", allow_missing = TRUE ) assay_edges <- create_edges( meta, "sample_id", "assay_id", "Sample", "Assay", "sample_measured_by_assay" ) featureset_edges <- create_edges( meta, "sample_id", "featureset_id", "Sample", "FeatureSet", "sample_uses_featureset" ) outcome_edges <- create_edges( data.frame( subject_id = c("P1", "P2", "P3", "P4"), outcome_id = c("O_case", "O_ctrl", "O_case", "O_ctrl"), stringsAsFactors = FALSE ), "subject_id", "outcome_id", "Subject", "Outcome", "subject_has_outcome" ) precedence_edges <- create_edges( data.frame( from_timepoint = c("T0", "T1"), to_timepoint = c("T1", "T2"), stringsAsFactors = FALSE ), "from_timepoint", "to_timepoint", "Timepoint", "Timepoint", "timepoint_precedes" ) featureset_from_study <- create_edges( data.frame( featureset_id = "FS_GLOBAL", study_id = "ST1", stringsAsFactors = FALSE ), "featureset_id", "study_id", "FeatureSet", "Study", "featureset_generated_from_study" ) featureset_from_batch <- create_edges( data.frame( featureset_id = "FS_GLOBAL", batch_id = "B1", stringsAsFactors = FALSE ), "featureset_id", "batch_id", "FeatureSet", "Batch", "featureset_generated_from_batch" ) ## ----construction-output------------------------------------------------------ sample_nodes as.data.frame(sample_nodes)[, c("node_id", "node_type", "node_key", "label")] edge_preview <- do.call(rbind, lapply( list( subject_edges, batch_edges, study_edges, time_edges, assay_edges, featureset_edges, outcome_edges, precedence_edges, featureset_from_study, featureset_from_batch ), as.data.frame )) edge_preview[, c("from", "to", "edge_type")] ## ----graph-------------------------------------------------------------------- graph <- build_dependency_graph( nodes = list( sample_nodes, subject_nodes, batch_nodes, study_nodes, time_nodes, assay_nodes, featureset_nodes, outcome_nodes ), edges = list( subject_edges, batch_edges, study_edges, time_edges, assay_edges, featureset_edges, outcome_edges, precedence_edges, featureset_from_study, featureset_from_batch ), graph_name = "vignette_graph", dataset_name = attr(meta, "dataset_name") ) graph summary(graph) ## ----plot, fig.width = 7, fig.height = 5-------------------------------------- plot(graph) ## ----plot-options, eval = FALSE----------------------------------------------- # plot(graph, layout = "sugiyama") # alternative hierarchical layout # plot(graph, show_labels = FALSE) # hide labels on dense graphs # plot(graph, legend = FALSE) # suppress the legend # plot(graph, legend_position = "bottomright") # plot(graph, node_colors = c(Sample = "#000000")) ## ----validation--------------------------------------------------------------- validation <- validate_graph(graph) validation as.data.frame(validation)[, c("level", "severity", "code", "message")] ## ----strictness--------------------------------------------------------------- tryCatch( derive_split_constraints(graph, mode = "subject", samples = c("S1", "BAD")), error = function(e) e$message ) ## ----neighbors-and-paths------------------------------------------------------ neighbors_s1 <- query_neighbors(graph, node_ids = "sample:S1", direction = "out") neighbors_s1 as.data.frame(neighbors_s1)[, c("seed_node_id", "node_id", "node_type", "edge_type")] subject_outcome_path <- query_shortest_paths( graph, from = "sample:S1", to = "outcome:O_case", edge_types = c("sample_belongs_to_subject", "subject_has_outcome") ) subject_outcome_path as.data.frame(subject_outcome_path) ## ----projected-dependencies--------------------------------------------------- shared_dependencies <- detect_shared_dependencies( graph, via = c("Subject", "Batch", "FeatureSet") ) as.data.frame(shared_dependencies)[, c( "sample_id_1", "sample_id_2", "shared_node_type", "shared_node_id", "edge_type" )] dependency_components <- detect_dependency_components( graph, via = c("Subject", "Batch") ) as.data.frame(dependency_components) ## ----constraints-------------------------------------------------------------- subject_constraint <- derive_split_constraints(graph, mode = "subject") batch_constraint <- derive_split_constraints(graph, mode = "batch") study_constraint <- derive_split_constraints(graph, mode = "study") time_constraint <- derive_split_constraints(graph, mode = "time") strict_constraint <- derive_split_constraints( graph, mode = "composite", strategy = "strict", via = c("Subject", "Batch") ) rule_based_constraint <- derive_split_constraints( graph, mode = "composite", strategy = "rule_based", priority = c("batch", "study", "subject", "time") ) constraint_overview <- do.call(rbind, lapply( list( subject = subject_constraint, batch = batch_constraint, study = study_constraint, time = time_constraint, composite_strict = strict_constraint, composite_rule = rule_based_constraint ), function(x) { data.frame( strategy = x$strategy, groups = length(unique(x$sample_map$group_id)), warnings = if (is.null(x$metadata$warnings)) 0L else length(x$metadata$warnings), stringsAsFactors = FALSE ) } )) constraint_overview <- cbind(constraint = row.names(constraint_overview), constraint_overview) row.names(constraint_overview) <- NULL constraint_overview ## ----batch-constraint--------------------------------------------------------- batch_constraint as.data.frame(batch_constraint)[, c("sample_id", "group_id", "group_label", "explanation")] ## ----time-constraint---------------------------------------------------------- time_constraint as.data.frame(time_constraint)[, c("sample_id", "group_id", "timepoint_id", "order_rank")] ## ----composite-constraints---------------------------------------------------- strict_constraint as.data.frame(strict_constraint)[, c("sample_id", "group_id", "constraint_type")] rule_based_constraint as.data.frame(rule_based_constraint)[, c("sample_id", "group_id", "constraint_type", "group_label")] ## ----precedence-only---------------------------------------------------------- precedence_meta <- data.frame( sample_id = c("S1", "S2", "S3"), subject_id = c("P1", "P1", "P2"), study_id = c("ST1", "ST1", "ST2"), timepoint_id = c("T0", "T1", "T2"), stringsAsFactors = FALSE ) precedence_graph <- build_dependency_graph( nodes = list( create_nodes(precedence_meta, type = "Sample", id_col = "sample_id"), create_nodes(precedence_meta, type = "Subject", id_col = "subject_id"), create_nodes(precedence_meta, type = "Study", id_col = "study_id"), create_nodes( data.frame(timepoint_id = c("T0", "T1", "T2"), stringsAsFactors = FALSE), type = "Timepoint", id_col = "timepoint_id" ) ), edges = list( create_edges( precedence_meta, "sample_id", "subject_id", "Sample", "Subject", "sample_belongs_to_subject" ), create_edges( precedence_meta, "sample_id", "study_id", "Sample", "Study", "sample_from_study" ), create_edges( precedence_meta, "sample_id", "timepoint_id", "Sample", "Timepoint", "sample_collected_at_timepoint" ), create_edges( data.frame( from_timepoint = c("T0", "T1"), to_timepoint = c("T1", "T2"), stringsAsFactors = FALSE ), "from_timepoint", "to_timepoint", "Timepoint", "Timepoint", "timepoint_precedes" ) ), graph_name = "precedence_only_graph" ) precedence_time_constraint <- derive_split_constraints(precedence_graph, mode = "time") precedence_time_constraint$metadata$time_order_source as.data.frame(precedence_time_constraint)[, c("sample_id", "timepoint_id", "time_index", "order_rank")] ## ----split-spec--------------------------------------------------------------- split_spec <- as_split_spec(strict_constraint, graph = graph) split_spec as.data.frame(split_spec)[, c( "sample_id", "group_id", "batch_group", "study_group", "timepoint_id", "order_rank" )] split_spec_validation <- validate_split_spec(split_spec) split_spec_validation as.data.frame(split_spec_validation) ## ----risk-summary------------------------------------------------------------- risk_summary <- summarize_leakage_risks( graph, constraint = strict_constraint, split_spec = split_spec ) risk_summary as.data.frame(risk_summary)[, c("source", "severity", "category", "message")] ## ----case-study-1------------------------------------------------------------- subject_groups <- grouping_vector(subject_constraint) time_groups <- time_constraint$sample_map[, c("sample_id", "group_id", "timepoint_id", "order_rank")] subject_groups time_groups ## ----case-study-2------------------------------------------------------------- cross_study_issues <- as.data.frame(validation)[ as.data.frame(validation)$code == "subject_cross_study_overlap", c("severity", "code", "message") ] p2_shared <- detect_shared_dependencies( graph, via = "Subject", samples = c("S3", "S6") ) study_only_map <- study_constraint$sample_map[, c("sample_id", "group_id", "group_label")] strict_map <- strict_constraint$sample_map[, c("sample_id", "group_id", "constraint_type")] cross_study_issues as.data.frame(p2_shared) study_only_map[study_only_map$sample_id %in% c("S3", "S6"), ] strict_map[strict_map$sample_id %in% c("S3", "S6"), ] ## ----case-study-3------------------------------------------------------------- batch_missing <- batch_constraint$sample_map[ batch_constraint$sample_map$sample_id == "S5", c("sample_id", "group_id", "group_label", "explanation") ] rule_based_missing <- rule_based_constraint$sample_map[ rule_based_constraint$sample_map$sample_id == "S5", c("sample_id", "group_id", "constraint_type", "group_label", "explanation") ] split_spec_missing <- as.data.frame(split_spec)[ as.data.frame(split_spec)$sample_id == "S5", c("sample_id", "group_id", "batch_group", "study_group", "timepoint_id", "order_rank") ] batch_missing rule_based_missing split_spec_missing ## ----case-study-4------------------------------------------------------------- strategy_summary <- data.frame( constraint = c("subject", "batch", "study", "time", "composite_strict", "composite_rule"), groups = c( length(unique(subject_constraint$sample_map$group_id)), length(unique(batch_constraint$sample_map$group_id)), length(unique(study_constraint$sample_map$group_id)), length(unique(time_constraint$sample_map$group_id)), length(unique(strict_constraint$sample_map$group_id)), length(unique(rule_based_constraint$sample_map$group_id)) ), warnings = c( length(or_empty(subject_constraint$metadata$warnings)), length(or_empty(batch_constraint$metadata$warnings)), length(or_empty(study_constraint$metadata$warnings)), length(or_empty(time_constraint$metadata$warnings)), length(or_empty(strict_constraint$metadata$warnings)), length(or_empty(rule_based_constraint$metadata$warnings)) ), recommended_resampling = c( as_split_spec(subject_constraint, graph = graph)$recommended_resampling, as_split_spec(batch_constraint, graph = graph)$recommended_resampling, as_split_spec(study_constraint, graph = graph)$recommended_resampling, as_split_spec(time_constraint, graph = graph)$recommended_resampling, as_split_spec(strict_constraint, graph = graph)$recommended_resampling, as_split_spec(rule_based_constraint, graph = graph)$recommended_resampling ), stringsAsFactors = FALSE ) strategy_summary