## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = FALSE,
  tidy = FALSE,
  comment = "#>",
  progress = FALSE,
  echo = TRUE,
  dev = 'pdf',
  dpi = 120,
  fig.width = 6,
  fig.height = 4,
  out.width = "100%"
)

## ----setup--------------------------------------------------------------------
library(geommc)

## ----basic-usage, eval=FALSE--------------------------------------------------
# result <- geomc.vs(
#   X = X,              # n \times p predictor matrix
#   y = y               # n-vector response
# )

## ----data-1-------------------------------------------------------------------
# Set parameters
n <- 50          # Sample size
p <- 100         # Number of predictors
nonzero <- 3     # Number of true predictors
trueidx <- 1:3   # Indices of true predictors
nonzero.value <- 4  # Effect size

# True regression coefficients
TrueBeta <- numeric(p)
TrueBeta[trueidx] <- nonzero.value

# Generate correlated predictors
rho <- 0.5
set.seed(3)
xone <- matrix(rnorm(n * p), n, p)
X <- sqrt(1 - rho) * xone + sqrt(rho) * rnorm(n)

# Generate response
intercept <- 0.5
sigma_true <- 1
y <- intercept + X %*% TrueBeta + rnorm(n, 0, sigma_true)

cat("Data dimensions:\n")
cat("  n (observations):", n, "\n")
cat("  p (predictors):", p, "\n")
cat("  Number of true non-zero predictors:", nonzero, "\n")
cat("  True coefficient indices:", paste(trueidx, collapse = ", "), "\n")
cat("  True coefficient values:", nonzero.value, "\n")

## ----run-vs-1-----------------------------------------------------------------
# Run geomc.vs
set.seed(123)
result <- geomc.vs(
  X = X,
  y = y,
  model.summary = TRUE # Compute model summaries
)

## ----results-structure--------------------------------------------------------
names(result)

## ----mip-1--------------------------------------------------------------------
# Top 10 variables by MIP
top_vars <- order(result$mip, decreasing = TRUE)[1:10]

cat("Top 10 variables by Marginal Inclusion Probability:\n\n")
mip_df <- data.frame(
  Variable = top_vars,
  MIP = round(result$mip[top_vars], 4),
  True = ifelse(top_vars %in% trueidx, "Yes", "No")
)
print(mip_df, row.names = FALSE)

## ----plot-mip-1, fig.width=6, fig.height=4------------------------------------
par(mfrow = c(1, 1))

# Plot MIPs
plot(1:p, result$mip, 
     type = "h", lwd = 2, col = "gray",
     xlab = "Variable Index", 
     ylab = "MIP",
     main = "Marginal Inclusion Probabilities",
     ylim = c(0, 1))

# Highlight true variables
points(trueidx, result$mip[trueidx], 
       col = "red", pch = 19, cex = 2)

# Add threshold line
abline(h = 0.5, col = "blue", lty = 2, lwd = 2)

legend("topright", 
       legend = c("True variables", "MIP > 0.5 threshold"),
       col = c("red", "blue"), 
       pch = c(19, NA), 
       lty = c(NA, 2),
       lwd = 2)

## ----median-model-1-----------------------------------------------------------
cat("Median Probability Model:\n")
cat("  Number of variables selected:", length(result$median.model), "\n")
cat("  Variable indices:", paste(result$median.model, collapse = ", "), "\n\n")

# Check if we recovered the true model
correctly_identified <- all(trueidx %in% result$median.model)
false_positives <- setdiff(result$median.model, trueidx)
false_negatives <- setdiff(trueidx, result$median.model)

cat("Model Recovery:\n")
cat("  All true variables identified:", correctly_identified, "\n")
cat("  Number of false positives:", length(false_positives), "\n")
cat("  Number of false negatives:", length(false_negatives), "\n")

if (length(false_positives) > 0) {
  cat("  False positive indices:", paste(false_positives, collapse = ", "), "\n")
}

## ----coef-estimates-1---------------------------------------------------------
# Compare estimated vs true coefficients for true variables
cat("Coefficient Estimates (True Variables):\n\n")
coef_comparison <- data.frame(
  Variable = trueidx,
  True_Beta = TrueBeta[trueidx],
  Posterior_Mean = round(result$beta.mean[(1+trueidx)], 3),
  Beta_WAM = round(result$beta.wam[(1+trueidx)], 3),
  MIP = round(result$mip[trueidx], 4)
)
print(coef_comparison)
intercept_comparison<- data.frame(
  True_intercept =intercept,
  Posterior_Mean = round(result$beta.mean[1], 3),
  Beta_WAM = round(result$beta.wam[1], 3)
)
print(intercept_comparison)

## ----model-space, fig.width=7, fig.height=5-----------------------------------
# Model sizes visited
model_sizes <- apply(result$samples, 1, sum)

hist(model_sizes, breaks = 20, 
     main = "Distribution of Model Sizes Visited",
     xlab = "Number of Variables in Model",
     col = "lightblue", border = "white")
abline(v = nonzero, col = "red", lwd = 3, lty = 2)
legend("topright", 
       legend = "True model size",
       col = "red", lty = 2, lwd = 2)

cat("\nModel Size Summary:\n")
cat("  Mean model size:", round(mean(model_sizes), 2), "\n")
cat("  Median model size:", median(model_sizes), "\n")
cat("  True model size:", nonzero, "\n")

## ----generate-sparse-data-----------------------------------------------------
# Problem dimensions
n <- 80        # Number of documents
p <- 200       # Vocabulary size (number of words)
sparsity <- 0.95  # 95% of entries are zero

# True important words (only 5 words matter)
true_words <- c(10, 25, 50, 100, 150)
true_effects <- c(2.5, -2.0, 3.0, -1.8, 2.2)

# Create sparse design matrix (document-term matrix)
# Most words don't appear in most documents
X_dense <- matrix(0, n, p)

# Add word counts (Poisson distributed when word appears)
set.seed(3)
for (i in 1:n) {
  # Each document has about 5% of words present
  present_words <- sample(1:p, size = round(p * (1 - sparsity)))
  X_dense[i, present_words] <- rpois(length(present_words), lambda = 2)
}

# Ensure true words appear with higher frequency
for (word in true_words) {
  appears_in <- sample(1:n, size = round(0.7 * n))
  X_dense[appears_in, word] <- rpois(length(appears_in), lambda = 3)
}

# Remove zero-variance columns
col_vars <- apply(X_dense, 2, var)
nonzero_cols <- which(col_vars > 0)

cat("Original number of columns:", p, "\n")
cat("Columns with zero variance:", sum(col_vars == 0), "\n")
cat("Columns retained:", length(nonzero_cols), "\n\n")

# Keep mapping from new to old indices
old_to_new <- rep(NA, p)
old_to_new[nonzero_cols] <- 1:length(nonzero_cols)

# Update X_dense to only include non-zero variance columns
X_dense <- X_dense[, nonzero_cols, drop = FALSE]

# Update true_words indices to reflect new column positions
true_words_old <- true_words
true_words <- match(true_words_old, nonzero_cols)
true_words <- true_words[!is.na(true_words)]  # Keep only those that survived

cat("Original true words:", paste(true_words_old, collapse = ", "), "\n")
cat("New true words indices:", paste(true_words, collapse = ", "), "\n")
cat("True words retained:", length(true_words), "out of", length(true_words_old), "\n\n")

# Update true_effects to match retained true words
true_effects <- true_effects[!is.na(match(true_words_old, nonzero_cols))]

# Update p to reflect actual number of columns
p <- ncol(X_dense)
cat("Updated p:", p, "\n\n")

library(Matrix)
# Convert to sparse matrix (dgCMatrix)
X_sparse <- Matrix(X_dense, sparse = TRUE)

# Check sparsity
actual_sparsity <- 1 - (nnzero(X_sparse) / (n * p))
cat("Actual sparsity:", round(actual_sparsity * 100, 1), "%\n")
cat("Non-zero elements:", nnzero(X_sparse), "\n")
cat("Matrix class:", class(X_sparse), "\n")

## ----visualize-sparse, fig.width=6, fig.height=4------------------------------
# Visualize the sparse matrix pattern
image(
    x = 1:min(100, p),
    y = 1:min(50, n),
    z = as.matrix(t(X_sparse[1:min(50, n), 1:min(100, p)])),
    main = "Sparse Matrix Pattern (50 docs × 100 words)",
    ylab = "Documents",
    xlab = "Words",
    col = colorRampPalette(c("white", "blue", "darkblue"))(100)
)

## ----memory-comparison--------------------------------------------------------
# Compare memory usage
size_sparse <- object.size(X_sparse)
size_dense <- object.size(X_dense)

cat("Memory usage:\n")
cat("  Sparse matrix:", format(size_sparse, units = "Kb"), "\n")
cat("  Dense matrix:", format(size_dense, units = "Kb"), "\n")
cat("  Reduction:", 
    round(100 * (1 - as.numeric(size_sparse) / as.numeric(size_dense)), 1), 
    "%\n")

## ----generate-response--------------------------------------------------------
# Create coefficient vector (now with updated p)
beta_true <- numeric(p)
beta_true[true_words] <- true_effects

# Generate response (document sentiment/category)
set.seed(3)
linear_predictor <- X_sparse %*% beta_true
y <- as.numeric(linear_predictor) + rnorm(n, sd = 1.5)

cat("Response variable summary:\n")
print(summary(y))

## ----run-geomc-vs-------------------------------------------------------------
# Run variable selection
set.seed(123)
result <- geomc.vs(
  X = X_sparse,         # Sparse dgCMatrix
  y = y,
  model.summary = TRUE # Compute model summaries (default: FALSE)
)

## ----samples-explained--------------------------------------------------------
# Dimensions
cat("Sample matrix dimensions:", dim(result$samples), "\n")
cat("(rows = iterations, columns = variables)\n\n")

# First few iterations
cat("First 5 iterations (first 10 variables):\n")
print(result$samples[1:5, 1:20])

# Each row is a model
cat("\nIteration 1 includes variables:", which(result$samples[1, ] == 1), "\n")
cat("Iteration 2 includes variables:", which(result$samples[2, ] == 1), "\n")

# The proportion of accepted proposals
cat("\nAcceptance rate:", round(result$acceptance.rate, 3), "\n")

## ----logpost, fig.width=6, fig.height=4---------------------------------------
# Trace plot of log posterior
plot(result$log.p, type = "l",
     xlab = "Iteration", ylab = "Log Posterior",
     main = "Trace of Log Posterior Probability")

cat("\nLog posterior summary:\n")
cat("  Min:", round(min(result$log.p), 2), "\n")
cat("  Max:", round(max(result$log.p), 2), "\n")
cat("  Mean:", round(mean(result$log.p), 2), "\n")

## ----mip-analysis-------------------------------------------------------------
# Identify top words by MIP
top_n <- 20
top_indices <- order(result$mip, decreasing = TRUE)[1:top_n]

cat("Top", top_n, "words by Marginal Inclusion Probability:\n\n")
cat("(Note: Word indices are in the reduced matrix space)\n\n")

mip_table <- data.frame(
  Rank = 1:top_n,
  Word_Index = top_indices,
  MIP = round(result$mip[top_indices], 4),
  True_Word = ifelse(top_indices %in% true_words, "Yes", "No"),
  True_Effect = ifelse(
    top_indices %in% true_words,
    sprintf("%.2f", beta_true[top_indices]),
    ""
  )
)
print(mip_table, row.names = FALSE)

# Check recovery
cat("\nRecovery of true words:\n")
for (i in seq_along(true_words)) {
  word_idx <- true_words[i]
  cat(sprintf("  Word %3d (original %3d): MIP = %.4f, Rank = %2d\n",
              word_idx,
              true_words_old[i],
              result$mip[word_idx],
              which(order(result$mip, decreasing = TRUE) == word_idx)))
}

## ----plot-mips, fig.width=7, fig.height=6-------------------------------------
par(mfrow = c(2, 1), mar = c(4, 4, 3, 1))

# Plot 1: All MIPs
plot(
  1:p, result$mip,
  type = "h", col = "gray70", lwd = 1,
  xlab = "Word Index (in reduced matrix)",
  ylab = "MIP",
  main = "Marginal Inclusion Probabilities (All Words)",
  ylim = c(0, 1)
)
points(true_words, result$mip[true_words], 
       col = "red", pch = 19, cex = 2)
abline(h = 0.5, col = "blue", lty = 2, lwd = 2)
legend("topright",
       legend = c("True word", "MIP > 0.5"),
       col = c("red", "blue"),
       pch = c(19, NA),
       lty = c(NA, 2),
       lwd = 2)


## ----plot-wmips, fig.width=7, fig.height=6------------------------------------
par(mfrow = c(2, 1), mar = c(4, 4, 3, 1))

# Plot 2: All WMIPs
plot(
  1:p, result$wmip,
  type = "h", col = "gray70", lwd = 1,
  xlab = "Word Index (in reduced matrix)",
  ylab = "WMIP",
  main = "Weighted Marginal Inclusion Probabilities (All Words)",
  ylim = c(0, 1)
)
points(true_words, result$wmip[true_words], 
       col = "red", pch = 19, cex = 2)
abline(h = 0.5, col = "blue", lty = 2, lwd = 2)
legend("topright",
       legend = c("True word", "WMIP > 0.5"),
       col = c("red", "blue"),
       pch = c(19, NA),
       lty = c(NA, 2),
       lwd = 2)


## ----model-performance--------------------------------------------------------
# Median probability model
selected_words <- result$median.model

cat("Median Probability Model:\n")
cat("  Words selected:", length(selected_words), "\n")
cat("  Word indices (in reduced matrix):", paste(selected_words, collapse = ", "), "\n\n")

# Confusion matrix
true_positive <- sum(true_words %in% selected_words)
false_positive <- length(selected_words) - true_positive
false_negative <- length(true_words) - true_positive
true_negative <- p - length(true_words) - false_positive

cat("Classification Performance:\n")
cat("  True Positives:", true_positive, "/", length(true_words), "\n")
cat("  False Positives:", false_positive, "\n")
cat("  False Negatives:", false_negative, "\n")
cat("  Sensitivity:", round(true_positive / length(true_words), 3), "\n")
cat("  Precision:", 
    round(true_positive / max(1, length(selected_words)), 3), "\n")

## ----wam-model-performance----------------------------------------------------
# Weighted average model
selected_words <- result$wam

cat("Weighted Average Model:\n")
cat("  Words selected:", length(selected_words), "\n")
cat("  Word indices (in reduced matrix):", paste(selected_words, collapse = ", "), "\n\n")

# Confusion matrix
true_positive <- sum(true_words %in% selected_words)
false_positive <- length(selected_words) - true_positive
false_negative <- length(true_words) - true_positive
true_negative <- p - length(true_words) - false_positive

cat("Classification Performance:\n")
cat("  True Positives:", true_positive, "/", length(true_words), "\n")
cat("  False Positives:", false_positive, "\n")
cat("  False Negatives:", false_negative, "\n")
cat("  Sensitivity:", round(true_positive / length(true_words), 3), "\n")
cat("  Precision:", 
    round(true_positive / max(1, length(selected_words)), 3), "\n")


## ----coefficient-estimates----------------------------------------------------
cat("Coefficient Estimates for True Words:\n\n")

coef_table <- data.frame(
  Word_New = true_words,
  Word_Original = true_words_old[1:length(true_words)],
  True_Effect = true_effects,
  Post_Mean = round(result$beta.mean[(1+true_words)], 3),
  Beta_WAM = round(result$beta.wam[(1+true_words)], 3),
  MIP = round(result$mip[true_words], 4),
  WMIP = round(result$wmip[true_words], 4),
  Selected = ifelse(true_words %in% selected_words, "Yes", "")
)
print(coef_table, row.names = FALSE)

## ----sparse-model-space, fig.width=7, fig.height=6----------------------------
# Sizes of visited models
model_sizes <- apply(result$samples, 1, sum)

hist(
  model_sizes,
  breaks = 30,
  main = "Distribution of Model Sizes Visited",
  xlab = "Number of Words in Model",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)
abline(v = length(true_words), col = "red", lwd = 3, lty = 2)
abline(v = median(model_sizes), col = "blue", lwd = 2, lty = 2)
legend("topleft",
       legend = c(
         paste("True size =", length(true_words)),
         paste("Median =", median(model_sizes))
       ),
       col = c("red", "blue"),
       lty = 2,
       lwd = 2)

cat("\nModel size summary:\n")
cat("  Mean:", round(mean(model_sizes), 2), "\n")
cat("  Median:", median(model_sizes), "\n")
cat("  Range:", range(model_sizes), "\n")

## ----run-geomc-vs-asym--------------------------------------------------------
# Run variable selection
set.seed(123)
result_asym <- geomc.vs(
  X = X_sparse,         # Sparse dgCMatrix
  y = y,
  symm = FALSE,
  move.prob = c(0.4,0.4,0.2),
  model.summary = TRUE # Compute model summaries (default: FALSE)
)

## ----asym-analysis------------------------------------------------------------
cat("\nAcceptance rate:", round(result_asym$acceptance.rate, 3), "\n")
# Identify top words by MIP
top_n <- 20
top_indices <- order(result_asym$mip, decreasing = TRUE)[1:top_n]

cat("Top", top_n, "words by Marginal Inclusion Probability:\n\n")
cat("(Note: Word indices are in the reduced matrix space)\n\n")

mip_table <- data.frame(
  Rank = 1:top_n,
  Word_Index = top_indices,
  MIP = round(result_asym$mip[top_indices], 4),
  True_Word = ifelse(top_indices %in% true_words, "Yes", "No"),
  True_Effect = ifelse(
    top_indices %in% true_words,
    sprintf("%.2f", beta_true[top_indices]),
    ""
  )
)
print(mip_table, row.names = FALSE)

# Check recovery
cat("\nRecovery of true words:\n")
for (i in seq_along(true_words)) {
  word_idx <- true_words[i]
  cat(sprintf("  Word %3d (original %3d): MIP = %.4f, Rank = %2d\n",
              word_idx,
              true_words_old[i],
              result_asym$mip[word_idx],
              which(order(result_asym$mip, decreasing = TRUE) == word_idx)))
}

## ----asym-plot-mips, fig.width=7, fig.height=6--------------------------------
par(mfrow = c(2, 1), mar = c(4, 4, 3, 1))

# Plot 1: All MIPs
plot(
  1:p, result_asym$mip,
  type = "h", col = "gray70", lwd = 1,
  xlab = "Word Index (in reduced matrix)",
  ylab = "MIP",
  main = "Marginal Inclusion Probabilities (All Words)",
  ylim = c(0, 1)
)
points(true_words, result_asym$mip[true_words], 
       col = "red", pch = 19, cex = 2)
abline(h = 0.5, col = "blue", lty = 2, lwd = 2)
legend("topright",
       legend = c("True words", "MIP > 0.5"),
       col = c("red", "blue"),
       pch = c(19, NA),
       lty = c(NA, 2),
       lwd = 2)

## ----asym-model-performance---------------------------------------------------
# Median probability model
selected_words <- result_asym$median.model

cat("Median Probability Model:\n")
cat("  Words selected:", length(selected_words), "\n")
cat("  Word indices (in reduced matrix):", paste(selected_words, collapse = ", "), "\n\n")

# Confusion matrix
true_positive <- sum(true_words %in% selected_words)
false_positive <- length(selected_words) - true_positive
false_negative <- length(true_words) - true_positive
true_negative <- p - length(true_words) - false_positive

cat("Classification Performance:\n")
cat("  True Positives:", true_positive, "/", length(true_words), "\n")
cat("  False Positives:", false_positive, "\n")
cat("  False Negatives:", false_negative, "\n")
cat("  Sensitivity:", round(true_positive / length(true_words), 3), "\n")
cat("  Precision:", 
    round(true_positive / max(1, length(selected_words)), 3), "\n")

## ----asym-coefficient-estimates-----------------------------------------------
cat("Coefficient Estimates for True Words:\n\n")

coef_table <- data.frame(
  Word_New = true_words,
  Word_Original = true_words_old[1:length(true_words)],
  True_Effect = true_effects,
  Post_Mean = round(result_asym$beta.mean[(1+true_words)], 3),
  Beta_WAM = round(result_asym$beta.wam[(1+true_words)], 3),
  MIP = round(result_asym$mip[true_words], 4),
  WMIP = round(result_asym$wmip[true_words], 4),
  Selected = ifelse(true_words %in% selected_words, "Yes", "")
)
print(coef_table, row.names = FALSE)

## ----asym-model-space, fig.width=7, fig.height=6------------------------------
# Sizes of visited models
model_sizes <- apply(result_asym$samples, 1, sum)

hist(
  model_sizes,
  breaks = 30,
  main = "Distribution of Model Sizes Visited",
  xlab = "Number of Words in Model",
  ylab = "Frequency",
  col = "lightblue",
  border = "white"
)
abline(v = length(true_words), col = "red", lwd = 3, lty = 2)
abline(v = median(model_sizes), col = "blue", lwd = 2, lty = 2)
legend("topleft",
       legend = c(
         paste("True size =", length(true_words)),
         paste("Median =", median(model_sizes))
       ),
       col = c("red", "blue"),
       lty = 2,
       lwd = 2)

cat("\nModel size summary:\n")
cat("  Mean:", round(mean(model_sizes), 2), "\n")
cat("  Median:", median(model_sizes), "\n")
cat("  Range:", range(model_sizes), "\n")

## ----tuning, eval=FALSE-------------------------------------------------------
# result <- geomc.vs(
#   X = X,
#   y = y,
#   initial = c(1,2),         # Initial model (the set of active variables)
#   n.iter = 100,        # Number of MCMC iterations
#   burnin = 2,           # Burn-in period (default: 1)
#   eps = 0.5,            # Perturbation parameter (default: 0.5)
#   symm = FALSE,          # Use symmetric proposals (default: TRUE)
#   move.prob = c(.3, .3, .4),  # Probabilities for add/delete/swap moves
#   lam0 = 0,             # Prior parameter (default: 0)
#   a0 = 0,               # Prior shape parameter (default: 0)
#   b0 = 0,               # Prior scale parameter (default: 0)
#   lam = n / p^2,        # Prior parameter (default: n/p^2)
#   w = sqrt(n) / p,      # Prior inclusion probability (default: sqrt(n)/p)
#   model.summary = TRUE, # Compute model summaries (default: FALSE)
#   model.threshold = 0.5, # Threshold for median model (default: 0.5)
#   show.progress = TRUE # Show progress during sampling
# )