## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.align = "center",
fig.retina = 2,
fig.width = 8,
fig.height = 6
)
# Bootstrap 5 code-folding hook for pkgdown compatibility
# Usage: add `echo=FALSE, code_fold=TRUE` to any chunk header
knitr::knit_hooks$set(code_fold = function(before, options, envir) {
if (!before && isTRUE(options$code_fold)) {
id <- gsub("[^a-zA-Z0-9]", "", options$label)
code_lines <- knitr::knit_code$get(options$label)
code_text <- htmltools::htmlEscape(paste(code_lines, collapse = "\n"))
sprintf(
'
Code
\n',
id, id, id, code_text
)
}
})
# Initialize timing
timings <- list()
t_vignette_start <- proc.time()
## ----load-packages------------------------------------------------------------
library(forestsearch)
library(survival)
library(data.table)
library(ggplot2)
library(gt)
library(grf)
library(policytree)
library(doFuture)
# Optional packages for enhanced output
library(patchwork)
library(weightedsurv)
# Set ggplot theme
theme_set(theme_minimal(base_size = 12))
## ----data-setup---------------------------------------------------------------
# Load GBSG data (included in forestsearch package)
df.analysis <- gbsg
# Prepare analysis variables
df.analysis <- within(df.analysis, {
id <- seq_len(nrow(df.analysis))
time_months <- rfstime / 30.4375
grade3 <- ifelse(grade == "3", 1, 0)
treat <- hormon
})
# Define variable roles
confounders.name <- c("age", "meno", "size", "grade3", "nodes", "pgr", "er")
outcome.name <- "time_months"
event.name <- "status"
id.name <- "id"
treat.name <- "hormon"
# Display data structure
cat("Sample size:", nrow(df.analysis), "\n")
cat("Events:", sum(df.analysis[[event.name]]),
sprintf("(%.1f%%)\n", 100 * mean(df.analysis[[event.name]])))
cat("Baseline factors:", paste(confounders.name, collapse = ", "), "\n")
## ----baseline-table-----------------------------------------------------------
create_summary_table(
data = df.analysis,
treat_var = treat.name,
table_title = "GBSG Baseline Characteristics by Treatment Arm",
vars_continuous = c("age", "nodes", "size", "er", "pgr"),
vars_categorical = c("grade", "meno"),
font_size = 12
)
## ----km-itt, fig.width=8, fig.height=5----------------------------------------
# Prepare counting process data for KM plot
dfcount <- df_counting(
df = df.analysis,
by.risk = 6,
tte.name = outcome.name,
event.name = event.name,
treat.name = treat.name
)
# Plot with confidence intervals and log-rank test
plot_weighted_km(
dfcount,
conf.int = TRUE,
show.logrank = TRUE,
ymax = 1.05,
xmed.fraction = 0.775,
ymed.offset = 0.125
)
## ----grf-analysis-------------------------------------------------------------
t0 <- proc.time()
grf_est <- grf.subg.harm.survival(
data = df.analysis,
confounders.name = confounders.name,
outcome.name = outcome.name,
event.name = event.name,
id.name = id.name,
treat.name = treat.name,
maxdepth = 2,
n.min = 60,
dmin.grf = 12,
frac.tau = 0.6,
details = TRUE,
return_selected_cuts_only = FALSE
)
timings$grf <- (proc.time() - t0)["elapsed"]
## ----grf-trees, fig.width=10, fig.height=4------------------------------------
# Display policy trees
# leaf1 = recommend control, leaf2 = recommend treatment
oldpar <- par(mfrow = c(1, 2))
plot(grf_est$tree1, leaf.labels = c("Control", "Treat"), main = "Depth 1")
plot(grf_est$tree2, leaf.labels = c("Control", "Treat"), main = "Depth 2")
par(oldpar)
## ----parallel-setup-----------------------------------------------------------
# Detect available cores (limited to 2 cores for CRAN checks)
n_cores <- 2
n_cores_total <- parallel::detectCores()
cat("Using", n_cores, "of", n_cores_total, "total cores for parallel processing")
## ----forestsearch-main, fig.width=10, fig.height=7----------------------------
t0 <- proc.time()
fs <- forestsearch(
df.analysis,
confounders.name = confounders.name,
outcome.name = outcome.name,
treat.name = treat.name,
event.name = event.name,
id.name = id.name,
# Threshold parameters (per León et al. 2024)
hr.threshold = 1.25,
hr.consistency = 1.0,
pconsistency.threshold = 0.80,
stop_threshold = 0.80,
# Search configuration
sg_focus = "hr",
max_subgroups_search = 3,
use_twostage = TRUE,
# Factor selection
use_grf = TRUE,
return_selected_cuts_only = TRUE,
use_lasso = TRUE,
cut_type = "default",
# Subgroup constraints
maxk = 2,
n.min = 60,
d0.min = 12,
d1.min = 12,
# Consistency evaluation
fs.splits = 100,
# Parallel processing
parallel_args = list(
plan = "multisession",
workers = n_cores,
show_message = TRUE
),
# Output options
showten_subgroups = TRUE,
details = TRUE,
plot.sg = TRUE
)
plan("sequential")
timings$forestsearch <- (proc.time() - t0)["elapsed"]
cat("\nForestSearch completed in",
round(timings$forestsearch, 1), "seconds\n")
## ----fs-results---------------------------------------------------------------
# Generate results tables
res_tabs <- sg_tables(fs, ndecimals = 3, which_df = "est")
# Display top subgroups meeting criteria
res_tabs$sg10_out
## ----fs-estimates-------------------------------------------------------------
# ITT and subgroup estimates
res_tabs$tab_estimates
## ----fs-subgroup--------------------------------------------------------------
cat("Identified subgroup (H):", paste(fs$sg.harm, collapse = " & "), "\n")
cat("Subgroup size:", sum(fs$df.est$treat.recommend == 0),
sprintf("(%.1f%% of ITT)\n",
100 * mean(fs$df.est$treat.recommend == 0)))
## ----bootstrap, eval=TRUE-----------------------------------------------------
# Number of bootstrap iterations
# Use 500-2000 for production; reduced here for vignette
NB <- 2
t0 <- proc.time()
fs_bc <- forestsearch_bootstrap_dofuture(
fs.est = fs,
nb_boots = NB,
show_three = FALSE,
details = FALSE,
parallel_args = list(
plan = "multisession",
workers = n_cores,
show_message = TRUE
)
)
plan("sequential")
timings$bootstrap <- (proc.time() - t0)["elapsed"]
cat("\nBootstrap completed in",
round(timings$bootstrap / 60, 1), "minutes\n")
## ----bootstrap-summary--------------------------------------------------------
# Comprehensive summary with diagnostics
summaries <- summarize_bootstrap_results(
sgharm = fs$sg.harm,
boot_results = fs_bc,
create_plots = TRUE,
est.scale = "hr"
)
# Display bias-corrected estimates table
summaries$table
## ----fs-results_figure, fig.width=10, fig.height=7, fig.cap="Kaplan-Meier survival curves by identified subgroup"----
km_result <- plot_sg_weighted_km(
fs.est = fs,
outcome.name = "time_months",
event.name = "status",
treat.name = "hormon",
show.logrank = FALSE,
conf.int = TRUE,
by.risk = 12,
show.cox = FALSE, show.cox.bc = TRUE,
fs_bc = fs_bc,
hr_bc_position = "topright"
)
## ----event-summary------------------------------------------------------------
# note that default required minimum events is 12 for subgroup candidate
# Here we evaluate frequency of subgroup candidates in bootstrap samples less than 15
event_summary <- summarize_bootstrap_events(fs_bc, threshold = 15)
## ----bootstrap-diagnostics----------------------------------------------------
# Quality metrics
summaries$diagnostics_table_gt
## ----subgroup-agreement-------------------------------------------------------
# Agreement with original analysis
if (!is.null(summaries$subgroup_summary$original_agreement)) {
summaries$subgroup_summary$original_agreement
}
# Factor presence across bootstrap iterations
if (!is.null(summaries$subgroup_summary$factor_presence)) {
summaries$subgroup_summary$factor_presence
}
## ----bootstrap-plots, fig.width=10, fig.height=4, eval=TRUE-------------------
if (!is.null(summaries$plots)) {
summaries$plots$H_distribution + summaries$plots$Hc_distribution
}
## ----kfold-cv, eval = TRUE----------------------------------------------------
# 10-fold CV with multiple iterations
# Use Ksims >= 50 for production
Ksims <- 1
t0 <- proc.time()
fs_kfold <- forestsearch_tenfold(
fs.est = fs,
sims = Ksims,
Kfolds = 2,
details = FALSE,
parallel_args = list(
plan = "multisession",
workers = n_cores,
show_message = FALSE
)
)
plan("sequential")
timings$kfold <- (proc.time() - t0)["elapsed"]
metrics_tables <- cv_metrics_tables(fs_kfold)
metrics_tables
## ----oob-cv, eval = FALSE-----------------------------------------------------
# t0 <- proc.time()
#
# fs_OOB <- forestsearch_Kfold(
# fs.est = fs,
# details = FALSE,
# Kfolds = round(nrow(df.analysis)/100,0), # N-fold = leave-one-out
# parallel_args = list(
# plan = "multisession",
# workers = n_cores,
# show_message = TRUE
# )
# )
#
# plan("sequential")
# timings$oob <- (proc.time() - t0)["elapsed"]
#
# # Summarize OOB results
# cv_out <- forestsearch_KfoldOut(
# res = fs_OOB,
# details = FALSE,
# outall = TRUE
# )
#
# tables <- cv_summary_tables(cv_out)
#
# tables$combined_table
#
# tables$metrics_table
#
#
#
#
## ----forest-plot, fig.width=18, fig.height=12, fig.cap="Subgroup forest plot including identified subgroups"----
# Define reference subgroups for comparison
subgroups <- list(
age_gt65 = list(
subset_expr = "age > 65",
name = "Age > 65",
type = "reference"
),
age_le65 = list(
subset_expr = "age <= 65",
name = "Age ≤ 65",
type = "reference"
),
pgr_positive = list(
subset_expr = "pgr > 0",
name = "PgR > 0",
type = "reference"
),
pgr_negative = list(
subset_expr = "pgr <= 0",
name = "PgR ≤ 0",
type = "reference"
)
)
my_theme <- create_forest_theme(base_size = 24,
footnote_fontsize = 17, cv_fontsize = 22)
# Create forest plot
# Include fs_kfold and fs_OOB if available for CV metrics
result <- plot_subgroup_results_forestplot(
fs_results = list(
fs.est = fs,
fs_bc = fs_bc,
fs_OOB = NULL,
fs_kfold = fs_kfold
),
df_analysis = df.analysis,
subgroup_list = subgroups,
outcome.name = outcome.name,
event.name = event.name,
treat.name = treat.name,
E.name = "Hormonal",
C.name = "Chemo",
ci_column_spaces = 25,
xlog = TRUE,
theme = my_theme
)
# Option 2: Custom sizing
render_forestplot(result)
## ----KMdiffs, fig.width = 8, fig.height = 6, fig.align="center"---------------
# Add additional subgroups along with ITT and identified subgroups
ref_sgs <- list(
age_young = list(subset_expr = "age < 65", color = "brown"),
age_old = list(subset_expr = "age >= 65", color = "orange")
)
plot_km_band_forestsearch(
df = df.analysis,
fs.est = fs,
ref_subgroups = ref_sgs,
outcome.name = outcome.name,
event.name = event.name,
treat.name = treat.name,
draws_band = 20
)
# # Example with more subgroups
# ref_sgs <- list(
# pgr_positive = list(subset_expr = "pgr > 0", color ="green"),
# pgr_negative = list(subset_expr = "pgr <= 0", color = "purple"),
# age_young = list(subset_expr = "age < 65", color = "brown"),
# age_old = list(subset_expr = "age >= 65", color = "orange")
# )
## ----summary-findings---------------------------------------------------------
# Extract key results
cat("=" %>% rep(60) %>% paste(collapse = ""), "\n")
cat("FORESTSEARCH ANALYSIS SUMMARY\n")
cat("=" %>% rep(60) %>% paste(collapse = ""), "\n\n")
cat("Dataset: GBSG (N =", nrow(df.analysis), ")\n")
cat("Outcome: Recurrence-free survival\n\n")
cat("ITT Analysis:\n")
cat(" HR (95% CI): 0.69 (0.54, 0.89)\n\n")
cat("Identified Subgroup (H):\n")
cat(" Definition:", paste(fs$sg.harm, collapse = " & "), "\n")
cat(" Size:", sum(fs$df.est$treat.recommend == 0),
sprintf("(%.1f%%)\n", 100 * mean(fs$df.est$treat.recommend == 0)))
cat(" Unadjusted HR:", sprintf("%.2f", exp(fs$grp.consistency$out_sg$result$hr[1])), "\n")
cat("\nComplement Subgroup (Hc):\n")
cat(" Size:", sum(fs$df.est$treat.recommend == 1),
sprintf("(%.1f%%)\n", 100 * mean(fs$df.est$treat.recommend == 1)))
## ----timing-summary, echo=FALSE, code_fold=TRUE-------------------------------
timings$total <- (proc.time() - t_vignette_start)["elapsed"]
timing_df <- data.frame(
Analysis = c("GRF", "ForestSearch", "Bootstrap", "Total"),
Seconds = c(
timings$grf,
timings$forestsearch,
timings$bootstrap,
timings$total
)
)
timing_df$Minutes <- timing_df$Seconds / 60
gt(timing_df) |>
tab_header(title = "Computational Timing") |>
fmt_number(columns = c(Seconds, Minutes), decimals = 1) |>
cols_label(
Analysis = "Component",
Seconds = "Time (sec)",
Minutes = "Time (min)"
)
## ----session-info-------------------------------------------------------------
sessionInfo()