## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- # function to simulate data simulate_data <- function(truth, covariates, n, random.seed, sigma_y = 1) { # set seed random.seed <- list(seed = random.seed, kind = "Mersenne-Twister", normal.kind = "Inversion", sample.kind = "Rounding") suppressWarnings(do.call("set.seed", random.seed)) # sample data n = 100, 500, 2500 selected_sample <- sample(seq_along(truth$mu.0), n, replace = FALSE) mu.0 = truth$mu.0[selected_sample] mu.1 = truth$mu.1[selected_sample] x = covariates[selected_sample,] # assign treatment n1=round(n/2) n0=n-n1 zind=sample(1:n,size=n1) z=numeric(n) z[zind]=1 mu = z * mu.1 + (1-z) * mu.0 y <- mu + sigma_y*rnorm(n) return( list( x = x, z = z, y = y, mu.1 = mu.1, mu.0 = mu.0 ) ) } # function to get true GATE trueGATE <- function(truth, tau, ngates = 5) { n = length(tau) fd_label = ntile(tau, ngates) gates = numeric(ngates) for (i in 1:ngates) { gates[i] = mean(truth$mu.1[fd_label == i] - truth$mu.0[fd_label == i]) } return(gates) } # function to get true GATE via cross-fitting compute_true_GATE <- function(truth, input_x, full_data, n_sample, nfolds, ngates, iter) { n_train = round(n_sample*(nfolds-1)/nfolds) train_data = simulate_data(truth, input_x, n_train, iter) cf_train = causal_forest( model.matrix(~.-1, data = train_data$x), train_data$y, train_data$z, num.trees = 10, num.threads = 1 ) tau_test = predict(cf_train, newdata = model.matrix(~.-1, data = full_data$x))$predictions gate_test = trueGATE(full_data, tau_test, ngates) return(tibble( iter = iter, group = 1:ngates, true_gate = gate_test # FIX: Rename here to match your join logic )) } make_synth_acic_population <- function(n_pop = 5000, seed = 1, p_cont = 10, p_bin = 10, sigma_mu = 1) { set.seed(seed) # correlated continuous covariates rho <- 0.4 Sigma <- rho ^ abs(outer(seq_len(p_cont), seq_len(p_cont), "-")) Xc <- matrix(rnorm(n_pop * p_cont), n_pop, p_cont) %*% chol(Sigma) colnames(Xc) <- paste0("x", seq_len(p_cont)) # binary covariates correlated with continuous ones Xb <- matrix(0, n_pop, p_bin) colnames(Xb) <- paste0("b", seq_len(p_bin)) for (j in seq_len(p_bin)) { lin <- 0.6 * Xc[, ((j - 1) %% p_cont) + 1] - 0.3 * Xc[, ((j) %% p_cont) + 1] + rnorm(n_pop, sd = 0.5) pr <- plogis(lin) Xb[, j] <- rbinom(n_pop, 1, pr) } # a couple categorical covariates (factors) cut3 <- cut(Xc[, 3], breaks = quantile(Xc[, 3], probs = c(0, 1/3, 2/3, 1)), include.lowest = TRUE, labels = c("A", "B", "C")) cut4 <- cut(Xc[, 4], breaks = quantile(Xc[, 4], probs = c(0, 0.25, 0.5, 0.75, 1)), include.lowest = TRUE, labels = c("L1", "L2", "L3", "L4")) x <- data.frame( Xc, Xb, cat1 = factor(cut3), cat2 = factor(cut4) ) # define mu0(x) and heterogeneous tau(x) with nonlinearities and interactions mu0 <- 0.5 * Xc[, 1] - 0.25 * (Xc[, 2]^2) + sin(Xc[, 3]) + 0.6 * Xb[, 1] + 0.4 * (cut3 == "B") - 0.3 * (cut3 == "C") + 0.5 * Xc[, 5] * Xb[, 2] + 0.3 * pmax(Xc[, 6], 0) tau <- 0.2 + 0.35 * Xc[, 1] + 0.25 * (Xc[, 7] > 0) + 0.2 * Xb[, 3] - 0.25 * Xc[, 2] * Xb[, 1] + 0.3 * (cut4 %in% c("L3", "L4")) + 0.15 * cos(Xc[, 8]) mu0 <- sigma_mu * mu0 tau <- sigma_mu * tau mu1 <- mu0 + tau truth <- list( mu.0 = as.numeric(mu0), mu.1 = as.numeric(mu1) ) list(x = x, truth = truth) } # function to compute GATE bounds with michael's original code compute_GATE_bounds <- function(truth, input_x, n_sample, nfolds, ngates, n_iter) { # get training data full_data = simulate_data(truth, input_x, n_sample, n_iter) # create folds folds = caret::createFolds(full_data$y, k = nfolds) indcv = numeric(n_sample) tauCV = matrix(0, nrow = n_sample, ncol = nfolds) for (i in 1:nfolds) { x_full = model.matrix(~.-1, data = full_data$x) x_train = x_full[-folds[[i]], ] y_train = full_data$y[-folds[[i]]] z_train = full_data$z[-folds[[i]]] x_test = x_full[folds[[i]], ] y_test = full_data$y[folds[[i]]] z_test = full_data$z[folds[[i]]] # fit causal forest cf_train = causal_forest( x_train, y_train, z_train, num.trees = 20, num.threads = 1 ) # compute tau tauCV[, i] = predict(cf_train, x_full)$predictions indcv[folds[[i]]] = rep(i, nrow(x_test)) } # compute GATE output = GATEcv(full_data$z, tauCV, full_data$y, indcv, ngates = ngates) return( list( group = 1:ngates, gate = output$gate, sd = output$sd, upper = output$gate + 1.96 * output$sd, lower = output$gate - 1.96 * output$sd ) ) } # function to compute GATE bounds with evalHTE package compute_GATE_bounds_pkg <- function(truth, input_x, n_sample, nfolds, ngates, n_iter, algs) { # get training data full_data = simulate_data(truth, input_x, n_sample, n_iter) # create data df = data.frame( full_data$x, y = full_data$y, z = full_data$z ) # specify formula cov_names = input_x %>% names() user_formula <- paste0("y ~ z*(", paste(cov_names, collapse = " + "), ")") # run with evalITR package fit <- estimate_hte( treatment = "z", form = user_formula, data = df, algorithms = algs, n_folds = nfolds, meta_learner = "slearner") # evaluate HTE est <- evaluate_hte(fit) gate_pkg <- summary(est)$GATE %>% mutate(iter = n_iter) return(gate_pkg) } # function to loop over n_sim for true GATE run_simluation_gate <- function(truth, input_x, full_data, n_sample, nfolds, ngates, n_sim) { # loop over n_sim results = future_map(1:n_sim, ~compute_true_GATE(truth, input_x, full_data, n_sample, nfolds, ngates, .x), .options = furrr_options(seed = 123) ) %>% bind_rows() return(results) } # function to loop over n_sim for gate sd run_simluation_gate_sd <- function(truth, input_x, n_sample, nfolds, ngates, n_sim, pkg = FALSE, algs = c("causal_forest")) { # if pkg == FALSE, compute GATE bounds using michael's original code if (pkg == FALSE) { results = future_map(1:n_sim, ~compute_GATE_bounds(truth, input_x, n_sample, nfolds, ngates, .x), .options = furrr_options(seed = 123) ) %>% bind_rows() } else { # if pkg == TRUE, compute GATE bounds using evalITR package results = future_map(1:n_sim, ~compute_GATE_bounds_pkg(truth, input_x, n_sample, nfolds, ngates, .x, algs = algs), .options = furrr_options(seed = 123) ) %>% bind_rows() } return(results) } ## ----------------------------------------------------------------------------- Sys.setenv( OMP_NUM_THREADS = 1, OPENBLAS_NUM_THREADS = 1, MKL_NUM_THREADS = 1, VECLIB_MAXIMUM_THREADS = 1 ) library(evalHTE) library(future) future::plan(future::sequential) library(furrr) library(magrittr) # load ACIC 2016 data library(grf) # causal_forest options(grf.num.threads = 1) library(dplyr) # ntile, left_join, mutate, summarise library(tibble) synth <- make_synth_acic_population(n_pop = 5000, seed = 28, p_cont = 10, p_bin = 10) input_x <- synth$x truth <- synth$truth full_data <- simulate_data(truth, input_x, nrow(input_x), 2021) # parameters n_sample = 50 n_sim = 5 n_folds = 2 n_gates = 2 # compute true GATE results_gate = run_simluation_gate(truth, input_x, full_data, n_sample, n_folds, n_gates, n_sim) # compute GATE bounds results_sd_pkg = run_simluation_gate_sd(truth, input_x, n_sample, n_folds, n_gates, n_sim, pkg = TRUE, algs = c("causal_forest")) # check coverage results_sd_pkg %>% left_join(results_gate, by = c("iter", "group")) %>% mutate( upper = estimate + 1.96 * std.deviation, lower = estimate - 1.96 * std.deviation, coverage = ((upper > true_gate) & (lower < true_gate)) * 1 ) %>% summarise( coverage = mean(coverage, na.rm = TRUE) )