---
title: "Cross-Validation and Advanced Features in missoNet"
author: "Yixiao Zeng, Celia M. T. Greenwood"
date: "`r Sys.Date()`"
output: 
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
    number_sections: true
    fig_width: 7
    fig_height: 5
vignette: >
  %\VignetteIndexEntry{Cross-Validation and Advanced Features in missoNet}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  out.width = "90%",
  dpi = 90,
  message = FALSE,
  warning = FALSE
)
options(width = 100)
```

# Introduction

This vignette covers advanced features of **missoNet**:

* Cross-validation for optimal parameter selection
* 1-SE rule for enhanced regularization
* Parallel processing for large-scale problems
* Adaptive search strategies
* Convergence diagnostics and troubleshooting

```{r}
library(missoNet)
```

# Cross-Validation Framework

## Basic Cross-Validation

Cross-validation provides data-driven selection of regularization parameters:

```{r}
# Generate example data
sim <- generateData(n = 300, p = 30, q = 12, rho = 0.1, 
                    missing.type = "MCAR")

# 5-fold cross-validation
cvfit <- cv.missoNet(
  X = sim$X,
  Y = sim$Z,
  kfold = 5,
  compute.1se = TRUE,  # Also compute 1-SE models
  seed = 123,          # For reproducibility
  verbose = 1
)


# ====== Cross-Validation Summary ======
{
  cat("\nNumber of folds:", cvfit$kfold, "\n")
  cat("Lambda pairs evaluated:", length(cvfit$param_set$cv.errors.mean), "\n")
  cat("Minimum CV error:", round(min(cvfit$param_set$cv.errors.mean), 4), "\n")
}

# ====== Optimal parameters (min CV error) ======
{
  cat("\nlambda.beta:", cvfit$est.min$lambda.beta, "\n")
  cat("lambda.theta:", cvfit$est.min$lambda.theta, "\n")
}

# ====== 1-SE parameters ======
{
  cat("\nlambda.beta:", cvfit$est.1se.beta$lambda.beta, "\n")
  cat("lambda.theta:", cvfit$est.1se.theta$lambda.theta, "\n")
}
```

## Understanding the 1-SE Rule

The 1-SE rule selects simpler models within one standard error of minimum:

```{r}
# Compare three models
models <- list(
  "Minimum CV" = cvfit$est.min,
  "1SE Beta" = cvfit$est.1se.beta,
  "1SE Theta" = cvfit$est.1se.theta
)

# Create comparison table
model_comparison <- data.frame(
  Model = names(models),
  Lambda.Beta = numeric(3),
  Lambda.Theta = numeric(3),
  Nonzero.Coef = numeric(3),
  Network.Edges = numeric(3)
)

for (i in 1:3) {
  if (!is.null(models[[i]])) {
    model_comparison$Lambda.Beta[i] <- models[[i]]$lambda.beta
    model_comparison$Lambda.Theta[i] <- models[[i]]$lambda.theta
    model_comparison$Nonzero.Coef[i] <- sum(abs(models[[i]]$Beta) > 1e-8)
    model_comparison$Network.Edges[i] <- 
      sum(abs(models[[i]]$Theta[upper.tri(models[[i]]$Theta)]) > 1e-8)
  }
}

print(model_comparison, digits = 4)
```

## Visualizing CV Results

```{r}
# Heatmap of CV errors
plot(cvfit, type = "heatmap", detailed.axes = FALSE)
```

```{r}
# 3D surface plot
plot(cvfit, type = "scatter", plt.surf = TRUE)
```

## CV Error Analysis

```{r}
# Extract and analyze CV errors
cv_errors <- cvfit$param_set$cv.errors.mean
cv_errors_sd <- (cvfit$param_set$cv.errors.upper - cvfit$param_set$cv.errors.lower) / 2

# Create error surface data
lambda_beta_unique <- unique(cvfit$param_set$cv.grid.beta)
lambda_theta_unique <- unique(cvfit$param_set$cv.grid.theta)

# Find optimal region
min_error <- min(cv_errors)
near_optimal <- which(cv_errors <= min_error * 1.05)  # Within 5% of minimum

# ====== Regularization path statistics ======
{
  cat("\n  Error range: [", round(min(cv_errors), 4), ", ", 
      round(max(cv_errors), 4), "]\n", sep = "")
  cat("  Models within 5% of minimum:", length(near_optimal), "\n")
  cat("  This suggests", ifelse(length(near_optimal) > 10, "stable", "sensitive"), 
      "parameter selection\n")
}
```

# Parallel Processing

## Setting Up Parallel CV

Parallel processing is most beneficial for:

* Large datasets (max(p, q) > 100)
* Many folds (k > 5)
* Dense lambda grids

```{r, eval = FALSE}
library(parallel)

# Detect available cores
n_cores <- detectCores()
cat("Available cores:", n_cores, "\n")

# Create cluster, use 2 cores for illustration
cl <- makeCluster(min(n_cores - 1, 2))

# Run parallel cross-validation
cvfit_parallel <- cv.missoNet(
  X = sim$X,
  Y = sim$Z,
  kfold = 5,
  parallel = TRUE,
  cl = cl,
  verbose = 1
)

# Clean up
stopCluster(cl)
```

# Advanced Parameter Tuning

## Adaptive Grid Search

```{r}
# Standard grid search
time_standard <- system.time({
  fit_standard <- missoNet(
    X = sim$X,
    Y = sim$Z,
    adaptive.search = FALSE,
    verbose = 0
  )
})

# Adaptive search (faster)
time_adaptive <- system.time({
  fit_adaptive <- missoNet(
    X = sim$X,
    Y = sim$Z,
    adaptive.search = TRUE,
    verbose = 0
  )
})


# ====== Computation time comparison ======
{
  cat("\n  Standard search:", round(time_standard[3], 2), "seconds\n")
  cat("  Adaptive search:", round(time_adaptive[3], 2), "seconds\n")
  cat("  Speedup:", round(time_standard[3] / time_adaptive[3], 1), "x\n")
}
```

# Custom Penalty Factors

```{r}
p <- ncol(sim$X)
q <- ncol(sim$Z)

# Scenario: Prior knowledge about important predictors
# Group 1: Known important (predictors 1-10) - light penalty
# Group 2: Possibly important (predictors 11-30) - moderate penalty  
# Group 3: Likely unimportant (predictors 31-60) - heavy penalty

beta.pen.factor <- matrix(1, p, q)
beta.pen.factor[1:10, ] <- 0.1      # Light penalty
beta.pen.factor[11:20, ] <- 1.0     # Standard penalty
beta.pen.factor[21:p, ] <- 10.0     # Heavy penalty

# Network: Encourage block structure
theta.pen.factor <- matrix(1, q, q)
# Block 1: responses 1-6
theta.pen.factor[1:6, 1:6] <- 0.5
# Block 2: responses 7-12
theta.pen.factor[7:12, 7:12] <- 0.5
# Between blocks: heavy penalty
theta.pen.factor[1:6, 7:12] <- 2.0
theta.pen.factor[7:12, 1:6] <- 2.0

# Fit with structured penalties
fit_structured <- missoNet(
  X = sim$X,
  Y = sim$Z,
  beta.pen.factor = beta.pen.factor,
  theta.pen.factor = theta.pen.factor,
  verbose = 0
)

# Analyze selection pattern
selected_groups <- c(
  Group1 = sum(rowSums(abs(fit_structured$est.min$Beta[1:10, ])) > 1e-8),
  Group2 = sum(rowSums(abs(fit_structured$est.min$Beta[11:20, ])) > 1e-8),
  Group3 = sum(rowSums(abs(fit_structured$est.min$Beta[21:p, ])) > 1e-8)
)


# Penalty factors successfully guided selection toward prior knowledge
# Selected predictors by group:
print(selected_groups)
```

# Multi-Resolution Lambda Grids

```{r, eval = FALSE}
# Create focused grid around expected optimal region
# Based on preliminary analysis or domain knowledge

# Log-scale grid with concentration
create_focused_grid <- function(center, width, n_points) {
  log_center <- log10(center)
  log_seq <- seq(log_center - width/2, log_center + width/2, 
                 length.out = n_points)
  return(10^log_seq)
}

# Focused grids based on problem characteristics
lambda.beta.focused <- create_focused_grid(
  center = 0.05,   # Expected optimal region
  width = 1.5,     # Log10 scale width
  n_points = 30
)

lambda.theta.focused <- create_focused_grid(
  center = 0.01,
  width = 1.0,
  n_points = 30
)

# Fit with focused grid
fit_focused <- missoNet(
  X = sim$X,
  Y = sim$Z,
  lambda.beta = lambda.beta.focused,
  lambda.theta = lambda.theta.focused
)
```

# Convergence Diagnostics

## Monitoring Convergence

```{r}
# Fit with detailed verbosity
fit_verbose <- missoNet(
  X = sim$X,
  Y = sim$Z,
  lambda.beta = 0.05,
  lambda.theta = 0.01,
  verbose = 2  # Detailed output
)


# Check convergence information
if (!is.null(fit_verbose$est.min$converged)) {
  cat("\nConvergence Status:\n")
  cat("  Beta optimization:", 
      ifelse(fit_verbose$est.min$converged$Beta, "✓ Converged", "✗ Not converged"), "\n")
  cat("  Theta optimization:", 
      ifelse(fit_verbose$est.min$converged$Theta, "✓ Converged", "✗ Not converged"), "\n")
}


if (!is.null(fit_verbose$est.min$iterations)) {
  cat("\nIterations used:\n")
  cat("  Beta:", fit_verbose$est.min$iterations$Beta, "\n")
  cat("  Theta:", fit_verbose$est.min$iterations$Theta, "\n")
}
```

## Handling Convergence Issues

Strategies for difficult problems:

* Relax convergence tolerances
* Increase maximum iterations
* Use adaptive search for better initialization

```{r, eval = FALSE}
# Simulate a difficult problem
sim_difficult <- generateData(
  n = 100,
  p = 150,  # p > n (high-dimensional)
  q = 20,
  rho = 0.25,  # Higher missing rate
  missing.type = "MAR"
)

# Strategy 1: Relax tolerances
fit_relaxed <- missoNet(
  X = sim_difficult$X,
  Y = sim_difficult$Z,
  beta.tol = 1e-3,
  theta.tol = 1e-3,
  verbose = 0
)

# Strategy 2: Increase iterations
fit_more_iter <- missoNet(
  X = sim_difficult$X,
  Y = sim_difficult$Z,
  beta.max.iter = 10000,
  theta.max.iter = 10000,
  verbose = 0
)

# Strategy 3: Use adaptive search for better initialization
fit_adaptive_init <- missoNet(
  X = sim_difficult$X,
  Y = sim_difficult$Z,
  adaptive.search = TRUE,
  verbose = 0
)
```

# Relaxed Network Estimation

## De-biased Precision Matrix

```{r}
# Fit with relaxed network option
fit_relax <- missoNet(
  X = sim$X,
  Y = sim$Z,
  relax.net = TRUE,  # Refit network without penalty
  verbose = 0
)

# Compare penalized vs relaxed estimates
if (!is.null(fit_relax$est.min$relax.net)) {
  Theta_pen <- fit_relax$est.min$Theta
  Theta_relax <- fit_relax$est.min$relax.net
  
  # Extract edges
  edges_pen <- Theta_pen[upper.tri(Theta_pen)]
  edges_relax <- Theta_relax[upper.tri(Theta_relax)]
  
  # Select non-zero edges from penalized model
  active_edges <- abs(edges_pen) > 1e-8
  
  # === Relaxed network estimation ===
  cat("\n  Active edges:", sum(active_edges), "\n")
  cat("  Mean |edge| (penalized):", 
      round(mean(abs(edges_pen[active_edges])), 4), "\n")
  cat("  Mean |edge| (relaxed):", 
      round(mean(abs(edges_relax[active_edges])), 4), "\n")
  cat("  Relaxed estimates are typically larger (less shrinkage)\n")
}
```

# Model Selection Strategies

## Nested Cross-Validation

```{r}
# Nested CV for unbiased performance estimation
# Outer loop: Performance evaluation
# Inner loop: Parameter selection

evaluate_nested_cv <- function(X, Y, outer_folds = 5, inner_folds = 5) {
  n <- nrow(X)
  outer_fold_ids <- rep(1:outer_folds, length.out = n)
  outer_fold_ids <- sample(outer_fold_ids)
  
  test_errors <- numeric(outer_folds)
  
  for (k in 1:outer_folds) {
    # Split data
    test_idx <- which(outer_fold_ids == k)
    train_idx <- setdiff(1:n, test_idx)
    
    # Inner CV for parameter selection
    cvfit_inner <- cv.missoNet(
      X = X[train_idx, ],
      Y = Y[train_idx, ],
      kfold = inner_folds,
      verbose = 0
    )
    
    # Predict on test fold
    y_pred <- predict(cvfit_inner, newx = X[test_idx, ])
    y_true <- Y[test_idx, ]
    
    # Calculate error (ignoring missing values)
    mask <- !is.na(y_true)
    test_errors[k] <- mean((y_pred[mask] - y_true[mask])^2)
  }
  
  return(list(
    mean_error = mean(test_errors),
    se_error = sd(test_errors) / sqrt(outer_folds),
    fold_errors = test_errors
  ))
}

# Run nested CV
nested_results <- evaluate_nested_cv(
  sim$X, 
  sim$Z,
  outer_folds = 3,
  inner_folds = 3
)

{
  cat("\nNested Cross-Validation Results:\n")
  cat("  Mean test error:", round(nested_results$mean_error, 4), "\n")
  cat("  Standard error:", round(nested_results$se_error, 4), "\n")
  cat("  95% CI: [", 
      round(nested_results$mean_error - 1.96 * nested_results$se_error, 4), ", ",
      round(nested_results$mean_error + 1.96 * nested_results$se_error, 4), "]\n", sep = "")
}
```

# Best Practices Summary

## Recommended Workflow

```{r, echo = FALSE, include = TRUE}
{
  cat("=== Recommended missoNet Workflow ===\n\n")
  
  cat("1. Data Preparation:\n")
  cat("   - Check missing patterns\n")
  cat("   - Consider QC (usually recommended)\n")
  cat("   - Assess dimensionality (p vs n)\n\n")
  
  cat("2. Initial Exploration:\n")
  cat("   - Quick fit with adaptive.search = TRUE\n")
  cat("   - Examine selected lambda ranges\n")
  cat("   - Check convergence\n\n")
  
  cat("3. Cross-Validation:\n")
  cat("   - Use 5-10 folds depending on sample size\n")
  cat("   - Enable compute.1se = TRUE\n")
  cat("   - Consider parallel processing for large data\n\n")
  
  cat("4. Model Selection:\n")
  cat("   - Use eBIC for high-dimensional settings\n")
  cat("   - Consider 1-SE models for better generalization\n")
  cat("   - Validate on independent test set if available\n\n")
  
  cat("5. Diagnostics:\n")
  cat("   - Check convergence status\n")
  cat("   - Examine sparsity patterns\n")
  cat("   - Validate predictions\n")
}
```

## Performance Tips

```{r, echo = FALSE, include = TRUE}
# Create performance recommendation table
tips <- data.frame(
  Scenario = c(
    "Large n (>1000)",
    "Large p (>500)", 
    "Large q (>50)",
    "High missing (>30%)",
    "Dense solutions expected"
  ),
  Recommendation = c(
    "Use parallel processing, adaptive search",
    "Reduce n.lambda.beta, use eBIC",
    "Reduce n.lambda.theta, consider diagonal penalty",
    "Increase tolerances, check MAR assumption",
    "Reduce lambda ranges, focus grid"
  )
)

print(tips, right = FALSE)
```