## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
## ----install-BSET-------------------------------------------------------------
# install.packages("pak")
# pak::pak("PietroCarlotti/BSET")
## ----packages, message=FALSE, warning=FALSE-----------------------------------
library(ggplot2)
library(dplyr)
## ----quick-start-data---------------------------------------------------------
# Set the random seed for reproducibility
set.seed(1234)
# Sample size
n <- 50
# Binary covariate
X <- rbinom(n, 1, 0.5)
# Treatment assignment
Z <- rbinom(n, 1, 0.5)
# Primary outcome
Y <- 2 * Z -3 * X + rnorm(n)
# Surrogate outcome
S <- Y + rnorm(n)
df <- data.frame(
Y = Y,
S = S,
Z = Z,
X = X
)
## ----quick-start-no-X, eval=FALSE---------------------------------------------
# result_no_X <- BSET::BSET(
# data = df,
# Y = "Y",
# S = "S",
# Z = "Z",
# seed = 123,
# plot = TRUE
# )
# result_no_X$theta_posterior_plot
## ----quick-start-no-X-show, echo=FALSE, out.width="100%", fig.align='center', fig.cap="Posterior distribution of $\\theta$ from `BSET` without covariates. Dashed lines show estimated quantities: the Bayesian 95\\% credible interval upper bound and threshold $\\eta$ (blue shades), and the frequentist 95\\% confidence interval upper bound and threshold $\\varepsilon$ (orange shades)."----
knitr::include_graphics("BSET_quick_start_no_X_plot.png")
## ----quick-start-X, eval=FALSE------------------------------------------------
# result_X <- BSET::BSET(
# data = df,
# Y = "Y",
# S = "S",
# Z = "Z",
# X = "X",
# seed = 123,
# plot = TRUE
# )
# result_X$theta_posterior_plot
## ----quick-start-X-show, echo=FALSE, out.width="100%", fig.align='center', fig.cap="Posterior distribution of $\\theta$ from `BSET`, adjusted for a baseline covariate. Dashed lines show estimated quantities: the Bayesian 95\\% credible interval upper bound and threshold $\\eta$ (blue shades), and the frequentist 95\\% confidence interval upper bound and threshold $\\varepsilon$ (orange shades)."----
knitr::include_graphics("BSET_quick_start_X_plot.png")
## ----set-seed-----------------------------------------------------------------
# Set the random seed for reproducibility
set.seed(1234)
# Sample size and treatment assignment probability used throughout
n <- 50
p <- 0.5
## ----generate-data-no-X-------------------------------------------------------
# Mean vector of potential outcomes for the primary outcome and the surrogate
mu_star <- c(6, 6, 2.5, 2.5)
# Covariance matrix of potential outcomes for the primary outcome and the surrogate
Sigma_star <- kronecker(
diag(2),
matrix(
data = c(3, 3,
3, 3.1),
nrow = 2,
ncol = 2)
)
# Generate the data
data_no_X <- BSET::DGP_no_X(
n = n,
p = p,
mu_star = mu_star,
Sigma_star = Sigma_star,
model = "Gaussian"
)
## ----generate-data-X----------------------------------------------------------
# Binary covariate probability
q <- 0.5
# Mean vector for potential outcomes when X = 0 and when X = 1
mu_0 <- c(5, 5, 0, 0)
mu_1 <- c(5, -5, 0, -10)
# Covariance matrix for potential outcomes when X = 0 and when X = 1
Sigma_0 <- kronecker(
diag(2),
matrix(
data = c(1, 1,
1, 2),
nrow = 2,
ncol = 2)
)
Sigma_1 <- kronecker(
diag(2),
matrix(
data = c(1, 1,
1, 2),
nrow = 2,
ncol = 2)
)
# Generate the data
data_X <- BSET::DGP_X_binary(
n = n,
p = p,
q = q,
mu_0 = mu_0,
mu_1 = mu_1,
Sigma_0 = Sigma_0,
Sigma_1 = Sigma_1
)
## ----compute-delta-theta-no-X-------------------------------------------------
# Compute delta
delta_no_X <- BSET::compute_delta(MC_data = data_no_X)
# Compute theta
theta_no_X <- BSET::compute_theta(MC_data = data_no_X)
## ----compute-delta-theta-X----------------------------------------------------
# Compute delta
delta_X <- BSET::compute_delta(MC_data = data_X)
# Compute theta
theta_X <- BSET::compute_theta(MC_data = data_X)
## ----delta-theta-table, echo=FALSE--------------------------------------------
delta_theta_table <- data.frame(
Setting = c("No X (Perfect Surrogate)", "Binary X"),
"$\\widehat{\\delta}$" = c(delta_no_X$delta, delta_X$delta),
"$\\widehat{\\theta}$" = c(theta_no_X$theta, theta_X$theta),
check.names = FALSE
)
knitr::kable(
delta_theta_table,
caption = "Estimated values of $\\delta$ and $\\theta$ in the two settings.",
escape = FALSE,
booktabs = TRUE,
align = "ccc",
digits = 3
)
## ----compute-delta-theta-true-code, eval=FALSE--------------------------------
# # Compute Monte Carlo estimands (based on 1,000,000 samples) — this is slow
# estimands_Parast_et_al_2024 <- BSET::compute_estimands_Parast_et_al_2024(n_MC = 1000000)
# estimands_Carlotti_and_Parast_2026 <- BSET::compute_estimands_Carlotti_and_Parast_2026(n_MC = 1000000)
## ----compute-delta-theta-true-------------------------------------------------
# Load precomputed Monte Carlo estimands (based on 1,000,000 samples)
estimands_Parast_et_al_2024 <- BSET::estimands_Parast_et_al_2024
estimands_Carlotti_and_Parast_2026 <- BSET::estimands_Carlotti_and_Parast_2026
## ----estimands-Parast-et-al-2024, echo=FALSE----------------------------------
estimands_Parast_et_al_2024_table <- data.frame(
Setting = c("Useless surrogate", "Perfect surrogate", "Imperfect surrogate", "Misspecified model"),
"$\\delta$" = estimands_Parast_et_al_2024$delta_MC,
"$\\theta$" = estimands_Parast_et_al_2024$theta_MC,
check.names = FALSE
)
knitr::kable(
estimands_Parast_et_al_2024_table,
caption = "Monte Carlo estimates of $\\delta$ and $\\theta$ for the simulation settings considered in Parast et al. (2024).",
booktabs = TRUE,
align = "ccc",
digits = 3
)
## ----estimands-Carlotti-and-Parast-2026, echo=FALSE---------------------------
estimands_Carlotti_and_Parast_2026_table <- data.frame(
Setting = c("Perfect surrogate"),
"$\\delta$" = estimands_Carlotti_and_Parast_2026$delta_MC,
"$\\theta$" = estimands_Carlotti_and_Parast_2026$theta_MC,
check.names = FALSE
)
knitr::kable(
estimands_Carlotti_and_Parast_2026_table,
caption = "Monte Carlo estimates of $\\delta$ and $\\theta$ for the simulation settings considered in Carlotti and Parast (2026).",
booktabs = TRUE,
align = "ccc",
digits = 3
)
## ----compute-BF-distribution--------------------------------------------------
# Hypothesized value of V_S under the null
V_S_zero <- 0.5
# Prior parameters for the Beta distribution
a <- 1
b <- 1
# Alternative hypothesis for the Bayes factor
BF_alternative <- "greater"
# Compute the distribution of the Bayes factor
BF_distribution <- BSET::compute_BF_distribution(
n = n,
V_S_true = V_S_zero,
V_S_zero = V_S_zero,
a = a,
b = b,
BF_alternative = BF_alternative
)
## ----BF-distribution-plot, echo=FALSE, fig.width=8, fig.height=4.5, fig.align='center', fig.cap="Distribution of the Bayes factor under the null hypothesis $V_S = 0.5$."----
# Log transform the Bayes factor
BF_distribution$log_BF_values <- log(BF_distribution$BF_values)
ggplot(BF_distribution, aes(x = log_BF_values, y = BF_PMF)) +
geom_col(fill = "steelblue", width = 0.07) +
labs(
x = expression(log(BF[n])),
y = "PMF"
) +
theme_minimal()
## ----compute-BF-alpha---------------------------------------------------------
# Type I error rate
alpha <- 0.05
# Compute the critical value of the Bayes factor corresponding to alpha
BF_alpha <- BF_distribution %>%
dplyr::filter(BF_CDF >= 1 - alpha) %>%
dplyr::slice(1) %>%
dplyr::pull(BF_values)
## ----compute-V_S-star---------------------------------------------------------
# Type II error rate
beta <- 0.2
# Compute the value of v_S that satisfies the power constraint
V_S_star <- BSET::compute_V_S_star(
n = n,
alpha = alpha,
beta = beta,
V_S_zero = V_S_zero,
a = a,
b = b,
BF_alternative = BF_alternative,
root_tolerance = 1e-16
)$V_S_star
## ----compute-eta--------------------------------------------------------------
# Hypothesized value of the treatment effect on the primary outcome
v_Y <- estimands_Parast_et_al_2024$V_Y_MC[2]
# Compute the validation threshold eta
eta <- max(v_Y - V_S_star, 0)
## ----BSET-no-X-data-----------------------------------------------------------
# Prepare the data for BSET (no covariates)
BSET_no_X_data <- data.frame(
Y = data_no_X$P_observed[, "Y"],
S = data_no_X$P_observed[, "S"],
Z = data_no_X$Z
)
## ----BSET-no-X-posterior-sampling-parameters----------------------------------
# Posterior sampling parameters
n_chains <- 2
n_iter <- 2000
burn_in_ratio <- 0.25
## ----BSET-no-X-prior-parameters-----------------------------------------------
# Prior parameters
mu_0 <- rep(0, 4)
Sigma_0 <- diag(4)
s <- rep(1, 4)
tau <- 1
## ----BSET-no-X, eval=FALSE----------------------------------------------------
# # Run the BSET procedure without adjusting for covariates
# BSET_no_X_results <- BSET::BSET(
# data = BSET_no_X_data,
# Y = "Y",
# S = "S",
# Z = "Z",
# delta_true = estimands_Parast_et_al_2024$delta_MC[2],
# theta_true = estimands_Parast_et_al_2024$theta_MC[2],
# seed = 123,
# n_chains = n_chains,
# n_iter = n_iter,
# burn_in_ratio = burn_in_ratio,
# a = a,
# b = b,
# alpha = alpha,
# beta = beta,
# V_S_zero = V_S_zero,
# BF_alternative = BF_alternative,
# root_tolerance = 1e-16,
# mu_0 = mu_0,
# Sigma_0 = Sigma_0,
# s = s,
# tau = tau,
# plot = TRUE,
# mute = TRUE,
# parallel = TRUE
# )
## ----BSET-no-X-load, include=FALSE--------------------------------------------
## ----BSET-no-X-plot, echo=FALSE, out.width="100%", fig.align='center', fig.cap="Posterior distribution of $\\theta$ from the BSET procedure without adjusting for covariates. Dashed lines show estimated quantities: the Bayesian 95\\% credible interval upper bound and threshold $\\eta$ (blue shades), and the frequentist 95\\% confidence interval upper bound and threshold $\\varepsilon$ (orange shades). Solid lines show the true values of $\\theta$ (purple) and $\\delta$ (pink)."----
knitr::include_graphics("BSET_no_X_plot.png")
## ----BSET-X-data--------------------------------------------------------------
# Prepare the data for BSET (with covariates)
BSET_X_data <- data.frame(
Y = data_X$P_observed[, "Y"],
S = data_X$P_observed[, "S"],
Z = data_X$Z,
X = data_X$X
)
## ----BSET-X-prior-parameters--------------------------------------------------
# Prior parameters for the regression coefficients of the potential outcomes on the covariates
d <- 2
mu_beta <- rep(0, d)
Sigma_beta <- 10*diag(d)
## ----BSET-X, eval=FALSE-------------------------------------------------------
# # Run the BSET procedure adjusting for covariates
# BSET_X_results <- BSET::BSET(
# data = BSET_X_data,
# Y = "Y",
# S = "S",
# Z = "Z",
# X = "X",
# delta_true = estimands_Carlotti_and_Parast_2026$delta_MC[1],
# theta_true = estimands_Carlotti_and_Parast_2026$theta_MC[1],
# seed = 123,
# n_chains = n_chains,
# n_iter = n_iter,
# burn_in_ratio = burn_in_ratio,
# a = a,
# b = b,
# alpha = alpha,
# beta = beta,
# V_S_zero = V_S_zero,
# BF_alternative = BF_alternative,
# root_tolerance = 1e-16,
# mu_beta = mu_beta,
# Sigma_beta = Sigma_beta,
# s = s,
# tau = tau,
# plot = TRUE,
# mute = TRUE,
# parallel = TRUE
# )
## ----BSET-X-load, include=FALSE-----------------------------------------------
## ----BSET-X-plot, echo=FALSE, out.width="100%", fig.align='center', fig.cap="Posterior distribution of $\\theta$ from the BSET procedure adjusting for covariates. Dashed lines show estimated quantities: the Bayesian 95\\% credible interval upper bound and threshold $\\eta$ (blue shades), and the frequentist 95\\% confidence interval upper bound and threshold $\\varepsilon$ (orange shades). Solid lines show the true values of $\\theta$ (purple) and $\\delta$ (pink)."----
knitr::include_graphics("BSET_X_plot.png")