## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 5, dev = "svglite", fig.ext = "svg" ) library(corrselect) ## ----------------------------------------------------------------------------- data(bioclim_example) predictors <- bioclim_example[, -1] # Exclude response response <- bioclim_example[, 1] cat("Variables:", ncol(predictors), "\n") cat("Observations:", nrow(predictors), "\n") cat("Response: species_richness (continuous)\n") ## ----fig.width=8, fig.height=6, fig.alt="Correlation heatmap of 19 bioclimatic variables displayed as a color-coded matrix. Blue indicates negative correlations, white indicates near-zero correlations, and red indicates positive correlations. Numerical correlation values are overlaid on each cell. The heatmap reveals block structure with correlations ranging from -0.15 to 0.97, showing strong correlations among temperature-related variables and precipitation-related variables."---- cor_matrix <- cor(predictors) # Correlation heatmap col_pal <- colorRampPalette(c("#3B4992", "white", "#EE0000"))(100) par(mar = c(1, 1, 3, 1)) nc <- ncol(cor_matrix) nr <- nrow(cor_matrix) image(seq_len(nc), seq_len(nr), t(cor_matrix[nr:1, ]), col = col_pal, xlab = "", ylab = "", axes = FALSE, main = "Bioclimatic Variable Correlations (p = 19)", zlim = c(-1, 1)) axis(1, at = seq_len(nc), labels = colnames(cor_matrix), las = 2, cex.axis = 0.7) axis(2, at = nc:1, labels = colnames(cor_matrix), las = 2, cex.axis = 0.7) for (i in seq_len(nc)) { for (j in seq_len(nr)) { text_col <- if (abs(cor_matrix[j, i]) > 0.6) "white" else "black" text(i, nr - j + 1, sprintf("%.2f", cor_matrix[j, i]), cex = 0.5, col = text_col) } } ## ----------------------------------------------------------------------------- if (requireNamespace("caret", quietly = TRUE)) { # Apply caret's greedy algorithm to_remove_caret <- caret::findCorrelation(cor_matrix, cutoff = 0.7) result_caret <- predictors[, -to_remove_caret] cat("caret results:\n") cat(" Variables retained:", ncol(result_caret), "\n") cat(" Variables removed:", length(to_remove_caret), "\n") cat(" Removed:", paste(colnames(predictors)[to_remove_caret], collapse = ", "), "\n") } ## ----------------------------------------------------------------------------- # Apply corrselect (exact mode) result_corrselect <- corrPrune(predictors, threshold = 0.7, mode = "exact") cat("\ncorrselect results:\n") cat(" Variables retained:", ncol(result_corrselect), "\n") cat(" Variables removed:", length(attr(result_corrselect, "removed_vars")), "\n") cat(" Removed:", paste(attr(result_corrselect, "removed_vars"), collapse = ", "), "\n") ## ----fig.width=7, fig.height=5, fig.alt="Overlaid histogram comparing absolute correlation distributions across three methods: original data (gray bars), caret's findCorrelation (red bars), and corrselect (blue bars). Black vertical dashed line marks the 0.7 threshold. All methods successfully reduce correlations below the threshold, but corrselect retains more variables than caret while still satisfying the constraint, demonstrating the advantage of maximal clique enumeration over greedy removal."---- # Extract correlations cor_orig <- cor(predictors) cor_corrselect <- cor(result_corrselect) if (requireNamespace("caret", quietly = TRUE)) { cor_caret <- cor(result_caret) # Overlaid histogram comparing all three hist(abs(cor_orig[upper.tri(cor_orig)]), breaks = 30, main = "Distribution of Absolute Correlations", xlab = "Absolute Correlation", col = rgb(0.5, 0.5, 0.5, 0.4), xlim = c(0, 1)) hist(abs(cor_caret[upper.tri(cor_caret)]), breaks = 30, col = rgb(0.8, 0.2, 0.2, 0.4), add = TRUE) hist(abs(cor_corrselect[upper.tri(cor_corrselect)]), breaks = 30, col = rgb(0.2, 0.5, 0.8, 0.4), add = TRUE) abline(v = 0.7, col = "black", lwd = 2, lty = 2) legend("topright", legend = c( paste0("Original (", ncol(predictors), " vars)"), paste0("caret (", ncol(result_caret), " vars)"), paste0("corrselect (", ncol(result_corrselect), " vars)"), "Threshold" ), fill = c( rgb(0.5, 0.5, 0.5, 0.4), rgb(0.8, 0.2, 0.2, 0.4), rgb(0.2, 0.5, 0.8, 0.4), NA ), border = c("white", "white", "white", NA), lty = c(NA, NA, NA, 2), lwd = c(NA, NA, NA, 2), col = c(NA, NA, NA, "black"), bty = "o", bg = "white") } ## ----eval=requireNamespace("Boruta", quietly=TRUE)---------------------------- if (requireNamespace("Boruta", quietly = TRUE)) { # Boruta: "Which variables predict species_richness?" set.seed(123) boruta_result <- Boruta::Boruta( species_richness ~ ., data = bioclim_example, maxRuns = 100 ) cat("Boruta variable importance screening:\n") print(table(boruta_result$finalDecision)) important_vars <- names(boruta_result$finalDecision[ boruta_result$finalDecision == "Confirmed" ]) cat("\n Confirmed predictors:", length(important_vars), "\n") cat(" ", paste(important_vars, collapse = ", "), "\n") } ## ----------------------------------------------------------------------------- # corrselect: "Which variables are redundant?" corrselect_result <- corrPrune(predictors, threshold = 0.7) cat("\ncorrselect multicollinearity pruning:\n") cat(" Non-redundant variables:", ncol(corrselect_result), "\n") cat(" ", paste(names(corrselect_result), collapse = ", "), "\n") ## ----eval=FALSE--------------------------------------------------------------- # # Stage 1: Correlation-based pruning # data_pruned <- corrPrune(raw_data, threshold = 0.7) # # # Stage 2: Importance testing # boruta_result <- Boruta::Boruta(response ~ ., data = cbind(response, data_pruned)) # final_vars <- names(boruta_result$finalDecision[ # boruta_result$finalDecision == "Confirmed" # ]) # # # Stage 3: Final model # final_model <- lm(response ~ ., data = cbind(response, data_pruned)[, c("response", final_vars)]) ## ----eval=requireNamespace("glmnet", quietly=TRUE)---------------------------- if (requireNamespace("glmnet", quietly = TRUE)) { # Fit LASSO with cross-validation X <- as.matrix(predictors) y <- response set.seed(123) cv_lasso <- glmnet::cv.glmnet(X, y, alpha = 1) # Extract non-zero coefficients at lambda.1se (conservative choice) coef_lasso <- stats::coef(cv_lasso, s = "lambda.1se") selected_lasso <- rownames(coef_lasso)[coef_lasso[, 1] != 0][-1] # Remove intercept cat("glmnet (LASSO, λ = lambda.1se):\n") cat(" Variables retained:", length(selected_lasso), "\n") cat(" ", paste(selected_lasso, collapse = ", "), "\n") } ## ----eval=requireNamespace("glmnet", quietly=TRUE)---------------------------- if (requireNamespace("glmnet", quietly = TRUE)) { # Compare model performance model_glmnet <- lm(species_richness ~ ., data = bioclim_example[, c("species_richness", selected_lasso)]) model_corrselect <- lm(species_richness ~ ., data = cbind(species_richness = response, result_corrselect)) cat("\nModel comparison (OLS on selected variables):\n") cat(" glmnet: R² =", round(summary(model_glmnet)$r.squared, 3), "with", length(selected_lasso), "predictors\n") cat(" corrselect: R² =", round(summary(model_corrselect)$r.squared, 3), "with", ncol(result_corrselect), "predictors\n") } ## ----eval=requireNamespace("glmnet", quietly=TRUE), fig.width=10, fig.height=5, fig.alt="Side-by-side barplots comparing coefficient magnitudes between glmnet (left panel, salmon bars) and corrselect (right panel, blue bars). Left panel shows glmnet's shrunk coefficients affected by L1 penalty, biased toward zero. Right panel shows corrselect's unbiased OLS coefficients on pruned variables with preserved effect sizes. The comparison illustrates the tradeoff between prediction-focused shrinkage and interpretation-focused hard selection."---- if (requireNamespace("glmnet", quietly = TRUE)) { par(mfrow = c(1, 2), mar = c(8, 4, 3, 2)) # glmnet coefficients (shrinkage) coef_vals <- coef_lasso[coef_lasso[, 1] != 0, ][-1] barplot(sort(abs(coef_vals), decreasing = TRUE), las = 2, main = "glmnet: Shrunk Coefficients", ylab = "Absolute Coefficient Value", col = "salmon", cex.names = 0.7) # corrselect: unbiased OLS coefficients coef_corrselect <- coef(model_corrselect)[-1] # Remove intercept barplot(sort(abs(coef_corrselect), decreasing = TRUE), las = 2, main = "corrselect: Unbiased OLS Coefficients", ylab = "Absolute Coefficient Value", col = rgb(0.2, 0.5, 0.8, 0.7), cex.names = 0.7) } ## ----------------------------------------------------------------------------- # Manual iterative VIF removal manual_vif_removal <- function(formula, data, threshold = 5, max_iter = 10) { require(car) # Get response variable name response_var <- all.vars(formula)[1] # Get predictor names (handles ~ . notation) model <- lm(formula, data = data) current_vars <- names(coef(model))[-1] # Exclude intercept removed_vars <- character(0) vif_vals <- car::vif(model) while (max(vif_vals) > threshold && length(current_vars) > 1 && length(removed_vars) < max_iter) { # Remove variable with highest VIF var_to_remove <- names(which.max(vif_vals)) removed_vars <- c(removed_vars, var_to_remove) cat("Iteration", length(removed_vars), ": Removing", var_to_remove, "(VIF =", round(max(vif_vals), 2), ")\n") # Update variable list and refit current_vars <- setdiff(current_vars, var_to_remove) new_formula <- as.formula(paste(response_var, "~", paste(current_vars, collapse = " + "))) model <- lm(new_formula, data = data) vif_vals <- car::vif(model) } list(model = model, iterations = length(removed_vars), vif = vif_vals, removed = removed_vars, converged = max(vif_vals) <= threshold) } # Run manual VIF removal if (requireNamespace("car", quietly = TRUE)) { cat("Manual VIF removal (iterative):\n") manual_result <- manual_vif_removal(species_richness ~ ., data = bioclim_example, threshold = 5) cat("\nVariables kept:", length(manual_result$vif), "\n") if (!manual_result$converged) { cat("(Stopped at max_iter = 10; VIF threshold not yet reached)\n") } } ## ----------------------------------------------------------------------------- # Run modelPrune modelprune_result <- modelPrune(species_richness ~ ., data = bioclim_example, limit = 5) cat("\nmodelPrune results:\n") cat("Variables removed:", attr(modelprune_result, "removed_vars"), "\n") cat("Variables kept:", length(attr(modelprune_result, "selected_vars")), "\n") # Extract final model final_model <- attr(modelprune_result, "final_model") if (requireNamespace("car", quietly = TRUE)) { cat("\nFinal VIF values:\n") print(round(car::vif(final_model), 2)) } ## ----fig.width=8, fig.height=5, fig.alt="Side-by-side barplot showing VIF values before (red bars) and after (blue bars) applying modelPrune() for the top 15 variables ordered by initial VIF. Black horizontal dashed line marks the VIF limit of 5. Before pruning, many variables show high VIF values indicating severe multicollinearity. After modelPrune(), all retained variables have VIF below the threshold, and high-VIF variables are completely removed (shown as red-only bars), demonstrating automated and effective multicollinearity reduction."---- if (requireNamespace("car", quietly = TRUE)) { # Compute VIF for original model model_full <- lm(species_richness ~ ., data = bioclim_example) vif_before <- car::vif(model_full) # VIF after modelPrune vif_after <- car::vif(final_model) # Combined barplot par(mar = c(8, 4, 4, 2)) all_vars <- unique(c(names(vif_before), names(vif_after))) vif_combined <- data.frame( before = vif_before[match(all_vars, names(vif_before))], after = vif_after[match(all_vars, names(vif_after))] ) vif_combined[is.na(vif_combined)] <- 0 vif_combined <- vif_combined[order(vif_combined$before, decreasing = TRUE), ] # Show top 15 n_show <- min(15, nrow(vif_combined)) barplot(t(as.matrix(vif_combined[1:n_show, ])), beside = TRUE, las = 2, main = "VIF Before and After modelPrune()", ylab = "VIF", col = c(rgb(0.8, 0.2, 0.2, 0.7), rgb(0.2, 0.5, 0.8, 0.7)), cex.names = 0.6, names.arg = rownames(vif_combined)[1:n_show]) abline(h = 5, col = "black", lwd = 2, lty = 2) legend("topright", legend = c("Before", "After", "Limit = 5"), fill = c(rgb(0.8, 0.2, 0.2, 0.7), rgb(0.2, 0.5, 0.8, 0.7), NA), border = c("white", "white", NA), lty = c(NA, NA, 2), lwd = c(NA, NA, 2), col = c(NA, NA, "black"), bty = "o", bg = "white") } ## ----eval=FALSE--------------------------------------------------------------- # # Correlation pruning # data_pruned <- corrPrune(raw_data, threshold = 0.7) # # # VIF refinement # model_data <- modelPrune(response ~ ., data = data_pruned, limit = 5) # # # Importance testing (optional) # if (requireNamespace("Boruta", quietly = TRUE)) { # boruta_result <- Boruta::Boruta(response ~ ., data = model_data) # important_vars <- names(boruta_result$finalDecision[ # boruta_result$finalDecision == "Confirmed" # ]) # } # # # Final model: OLS (interpretable) or glmnet (prediction) # final_model <- lm(response ~ ., data = model_data[, c("response", important_vars)]) ## ----------------------------------------------------------------------------- sessionInfo()