## ----setup, include=FALSE-----------------------------------------------------
# Global knitr chunk options for the vignette output.
knitr::opts_chunk$set( collapse = TRUE, comment = "#>")

## ----warning=FALSE------------------------------------------------------------
# Simulation 1: balanced logistic regression (50% zeros / 50% ones).
# Compares glm.fit2 (labelled "OLS", i.e. plain IRLS) against savvy_glm.fit2
# with several shrinkage estimators, measured by the L2 distance between the
# estimated and true coefficient vectors across AR(1) correlation levels.

# Load packages
library(savvyGLM)
library(MASS)
library(glm2)
library(CVXR)
library(knitr)

set.seed(123)
n_val <- 500                              # final sample size after subsampling
p_val <- 25                               # number of covariates (before intercept)
rho_vals <- c(-0.75, -0.5, 0, 0.5, 0.75)  # AR(1) correlation levels to sweep
mu_val <- 0                               # common covariate mean
target_proportion <- 0.5                  # fraction of zeros kept -> balanced classes
control_list <- list(maxit = 250, epsilon = 1e-6, trace = FALSE)
family_type <- binomial(link = "logit")

# AR(1) covariance matrix: entry (i, j) equals rho^|i - j|.
sigma.rho <- function(rho_val, p_val) {
  rho_val ^ abs(outer(1:p_val, 1:p_val, "-"))
}

# True coefficients: alternating signs with magnitudes 1, 1, 2, 2, 3, 3, ...
theta_func <- function(p_val) {
  sgn <- rep(c(1, -1), length.out = p_val)
  mag <- ceiling(seq_len(p_val) / 2)
  sgn * mag
}

model_names <- c("OLS", "SR", "GSR", "St", "DSh", "LW", "QIS", "Sh")
# Rows = estimators, columns = correlation levels; filled in the loop below.
l2_matrix <- matrix(NA, nrow = length(model_names), ncol = length(rho_vals),
                    dimnames = list(model_names, paste0("rho=", rho_vals)))

for (j in seq_along(rho_vals)) {
  rho_val <- rho_vals[j]
  Sigma <- sigma.rho(rho_val, p_val)
  # Oversample 10x, then subsample below to hit the target class balance.
  n_val_large <- n_val * 10
  X_large <- mvrnorm(n_val_large, mu = rep(mu_val, p_val), Sigma = Sigma)
  X_large_intercept <- cbind(1, X_large)
  beta_true <- theta_func(p_val + 1)
  # Logistic mean: inverse-logit of the linear predictor.
  mu_y_large <- as.vector(1 / (1 + exp(-X_large_intercept %*% beta_true)))
  y_large <- rbinom(n = n_val_large, size = 1, prob = mu_y_large)
  # Keep the first round(n * prop) zeros and the complementary number of ones.
  # NOTE(review): if fewer zeros/ones than requested are drawn, these index
  # vectors contain NA -- presumably safe at this oversample size; verify.
  y_zero_indices <- which(y_large == 0)[1:round(n_val * target_proportion)]
  y_one_indices <- which(y_large == 1)[1:(n_val - round(n_val * target_proportion))]
  final_indices <- c(y_zero_indices, y_one_indices)
  X_final <- X_large_intercept[final_indices, ]
  y_final <- y_large[final_indices]
  # Baseline IRLS fit and its L2 estimation error.
  fit_ols <- glm.fit2(X_final, y_final, control = control_list, family = family_type)
  l2_matrix["OLS", j] <- norm(fit_ols$coefficients - beta_true, type = "2")
  # Shrinkage variants share the same data and control settings.
  for (m in model_names[-1]) {
    fit <- savvy_glm.fit2(X_final, y_final, model_class = m, control = control_list, family = family_type)
    l2_matrix[m, j] <- norm(fit$coefficients - beta_true, type = "2")
  }
}

## 
----warning=FALSE------------------------------------------------------------
# Tabulate simulation 1: L2 errors for the balanced logistic design.
l2_table <- as.data.frame(l2_matrix)
kable(l2_table, digits = 4, caption = "L2 Distance Between Estimated and True Coefficients (Balanced LR)")

## ----warning=FALSE------------------------------------------------------------
# Simulation 2: imbalanced logistic regression (5% zeros / 95% ones).
# Identical to simulation 1 except target_proportion, to stress the
# estimators under severe class imbalance.

# Load packages
library(savvyGLM)
library(MASS)
library(glm2)
library(CVXR)
library(knitr)

set.seed(123)
n_val <- 500
p_val <- 25
rho_vals <- c(-0.75, -0.5, 0, 0.5, 0.75)
mu_val <- 0
target_proportion <- 0.05                 # only 5% of kept observations are zeros
control_list <- list(maxit = 250, epsilon = 1e-6, trace = FALSE)
family_type <- binomial(link = "logit")

# AR(1) covariance matrix: entry (i, j) equals rho^|i - j|.
sigma.rho <- function(rho_val, p_val) {
  rho_val ^ abs(outer(1:p_val, 1:p_val, "-"))
}

# True coefficients: alternating signs with magnitudes 1, 1, 2, 2, 3, 3, ...
theta_func <- function(p_val) {
  sgn <- rep(c(1, -1), length.out = p_val)
  mag <- ceiling(seq_len(p_val) / 2)
  sgn * mag
}

model_names <- c("OLS", "SR", "GSR", "St", "DSh", "LW", "QIS", "Sh")
l2_matrix <- matrix(NA, nrow = length(model_names), ncol = length(rho_vals),
                    dimnames = list(model_names, paste0("rho=", rho_vals)))

for (j in seq_along(rho_vals)) {
  rho_val <- rho_vals[j]
  Sigma <- sigma.rho(rho_val, p_val)
  # Oversample 10x, then subsample to the imbalanced class mix below.
  n_val_large <- n_val * 10
  X_large <- mvrnorm(n_val_large, mu = rep(mu_val, p_val), Sigma = Sigma)
  X_large_intercept <- cbind(1, X_large)
  beta_true <- theta_func(p_val + 1)
  mu_y_large <- as.vector(1 / (1 + exp(-X_large_intercept %*% beta_true)))
  y_large <- rbinom(n = n_val_large, size = 1, prob = mu_y_large)
  # Subsample: 5% zeros, 95% ones.
  y_zero_indices <- which(y_large == 0)[1:round(n_val * target_proportion)]
  y_one_indices <- which(y_large == 1)[1:(n_val - round(n_val * target_proportion))]
  final_indices <- c(y_zero_indices, y_one_indices)
  X_final <- X_large_intercept[final_indices, ]
  y_final <- y_large[final_indices]
  fit_ols <- glm.fit2(X_final, y_final, control = control_list, family = family_type)
  l2_matrix["OLS", j] <- norm(fit_ols$coefficients - beta_true, type = "2")
  for (m in model_names[-1]) {
    fit <- savvy_glm.fit2(X_final, y_final, model_class = m, control = control_list, family = family_type)
    l2_matrix[m, j] <- norm(fit$coefficients - beta_true, type = "2")
  }
}

## ----warning=FALSE------------------------------------------------------------
# Tabulate simulation 2: L2 errors for the imbalanced logistic design.
l2_table <- as.data.frame(l2_matrix)
kable(l2_table, digits = 4, caption = "L2 Distance Between Estimated and True Coefficients (Imbalanced LR)")

## ----warning=FALSE------------------------------------------------------------
# Simulation 3: Poisson GLM with the log link. True coefficients are small
# and geometrically decaying so that exp(X %*% beta) stays in a numerically
# safe range for rpois().

# Load packages
library(savvyGLM)
library(MASS)
library(glm2)
library(CVXR)
library(knitr)

set.seed(123)
n_val <- 500
p_val <- 25
rho_vals <- c(-0.75, -0.5, 0, 0.5, 0.75)
mu_val <- 0
control_list <- list(maxit = 250, epsilon = 1e-6, trace = FALSE)
family_type <- poisson(link = "log")

# AR(1) covariance matrix: entry (i, j) equals rho^|i - j|.
sigma.rho <- function(rho_val, p_val) {
  rho_val ^ abs(outer(1:p_val, 1:p_val, "-"))
}

# True coefficients: +/- 0.1 * 0.95^k, repeated in sign-alternating pairs.
theta_func <- function(p_val) {
  base_increment <- 0.1
  growth_rate <- 0.95
  betas <- base_increment * (growth_rate ^ seq(from = 0, length.out = ceiling(p_val / 2)))
  betas <- rep(betas, each = 2)[1:p_val]
  signs <- rep(c(1, -1), length.out = p_val)
  betas <- betas * signs
  return(betas)
}

model_names <- c("OLS", "SR", "GSR", "St", "DSh", "LW", "QIS", "Sh")
l2_matrix <- matrix(NA, nrow = length(model_names), ncol = length(rho_vals),
                    dimnames = list(model_names, paste0("rho=", rho_vals)))

for (j in seq_along(rho_vals)) {
  rho_val <- rho_vals[j]
  Sigma <- sigma.rho(rho_val, p_val)
  X <- mvrnorm(n_val, mu = rep(mu_val, p_val), Sigma = Sigma)
  X_intercept <- cbind(1, X)
  beta_true <- theta_func(p_val + 1)
  # Poisson mean under the log link.
  mu_y <- as.vector(exp(X_intercept %*% beta_true))
  y <- rpois(n_val, lambda = mu_y)
  fit_ols <- glm.fit2(X_intercept, y, control = control_list, family = family_type)
  l2_matrix["OLS", j] <- norm(fit_ols$coefficients - beta_true, type = "2")
  for (m in model_names[-1]) {
    fit <- savvy_glm.fit2(X_intercept, y, model_class = m, control = control_list, family = family_type)
    l2_matrix[m, j] <- norm(fit$coefficients - beta_true, type = "2")
  }
}

## ----warning=FALSE------------------------------------------------------------
# Tabulate simulation 3 (statement continues on the next source line).
l2_table <-
as.data.frame(l2_matrix)
kable(l2_table, digits = 4, caption = "L2 Distance Between Estimated and True Coefficients (log link for Poisson GLM)")

## ----warning=FALSE------------------------------------------------------------
# Simulation 4: Poisson GLM with the sqrt link; lambda = (X %*% beta)^2.
# The integer-magnitude coefficients are reused because squaring the linear
# predictor keeps the mean non-negative by construction.

# Load packages
library(savvyGLM)
library(MASS)
library(glm2)
library(CVXR)
library(knitr)

set.seed(123)
n_val <- 500
p_val <- 25
rho_vals <- c(-0.75, -0.5, 0, 0.5, 0.75)
mu_val <- 0
control_list <- list(maxit = 250, epsilon = 1e-6, trace = FALSE)
family_type <- poisson(link = "sqrt")

# AR(1) covariance matrix: entry (i, j) equals rho^|i - j|.
sigma.rho <- function(rho_val, p_val) {
  rho_val ^ abs(outer(1:p_val, 1:p_val, "-"))
}

# True coefficients: alternating signs with magnitudes 1, 1, 2, 2, 3, 3, ...
theta_func <- function(p_val) {
  sgn <- rep(c(1, -1), length.out = p_val)
  mag <- ceiling(seq_len(p_val) / 2)
  sgn * mag
}

model_names <- c("OLS", "SR", "GSR", "St", "DSh", "LW", "QIS", "Sh")
l2_matrix <- matrix(NA, nrow = length(model_names), ncol = length(rho_vals),
                    dimnames = list(model_names, paste0("rho=", rho_vals)))

for (j in seq_along(rho_vals)) {
  rho_val <- rho_vals[j]
  Sigma <- sigma.rho(rho_val, p_val)
  X <- mvrnorm(n_val, mu = rep(mu_val, p_val), Sigma = Sigma)
  X_intercept <- cbind(1, X)
  beta_true <- theta_func(p_val + 1)
  # Poisson mean under the sqrt link: square of the linear predictor.
  mu_y <- as.vector((X_intercept %*% beta_true)^2)
  y <- rpois(n_val, lambda = mu_y)
  fit_ols <- glm.fit2(X_intercept, y, control = control_list, family = family_type)
  l2_matrix["OLS", j] <- norm(fit_ols$coefficients - beta_true, type = "2")
  for (m in model_names[-1]) {
    fit <- savvy_glm.fit2(X_intercept, y, model_class = m, control = control_list, family = family_type)
    l2_matrix[m, j] <- norm(fit$coefficients - beta_true, type = "2")
  }
}

## ----warning=FALSE------------------------------------------------------------
# Tabulate simulation 4.
l2_table <- as.data.frame(l2_matrix)
kable(l2_table, digits = 4, caption = "L2 Distance Between Estimated and True Coefficients (sqrt link for Poisson GLM)")

## ----warning=FALSE------------------------------------------------------------
# Simulation 5: Gamma GLM with the log link (body continues on the next
# source line).

# Load packages
library(savvyGLM)
library(MASS)
library(glm2)
library(CVXR)
library(knitr)

set.seed(123)
n_val <- 500
p_val <- 25
rho_vals <- c(-0.75, -0.5, 0, 0.5, 0.75)
mu_val <- 0
control_list <- list(maxit = 250, epsilon = 1e-6, trace = FALSE)
family_type <- Gamma(link = "log")

# AR(1) covariance matrix: entry (i, j) equals rho^|i - j|.
sigma.rho <- function(rho_val, p_val) {
  rho_val ^ abs(outer(1:p_val, 1:p_val, "-"))
}

# True coefficients: +/- 0.1 * 0.95^k, repeated in sign-alternating pairs.
theta_func <- function(p_val) {
  base_increment <- 0.1
  growth_rate <- 0.95
  betas <- base_increment * (growth_rate ^ seq(from = 0, length.out = ceiling(p_val / 2)))
  betas <- rep(betas, each = 2)[1:p_val]
  signs <- rep(c(1, -1), length.out = p_val)
  betas <- betas * signs
  return(betas)
}

model_names <- c("OLS", "SR", "GSR", "St", "DSh", "LW", "QIS", "Sh")
l2_matrix <- matrix(NA, nrow = length(model_names), ncol = length(rho_vals),
                    dimnames = list(model_names, paste0("rho=", rho_vals)))

for (j in seq_along(rho_vals)) {
  rho_val <- rho_vals[j]
  Sigma <- sigma.rho(rho_val, p_val)
  X <- mvrnorm(n_val, mu = rep(mu_val, p_val), Sigma = Sigma)
  X_intercept <- cbind(1, X)
  beta_true <- theta_func(p_val + 1)
  # Gamma mean under the log link; used as the shape with scale 1 so that
  # E[y] equals mu_y.
  mu_y <- as.vector(exp(X_intercept %*% beta_true))
  y <- rgamma(n_val, shape = mu_y, scale = 1)
  # Redraw any non-positive responses (Gamma support is strictly positive);
  # capped at 100 attempts so the loop always terminates.
  retry_count <- 0
  while(any(y <= 0) && retry_count < 100) {
    bad_idx <- which(y <= 0)
    y[bad_idx] <- rgamma(length(bad_idx), shape = mu_y[bad_idx], scale = 1)
    retry_count <- retry_count + 1
  }
  fit_ols <- glm.fit2(X_intercept, y, control = control_list, family = family_type)
  l2_matrix["OLS", j] <- norm(fit_ols$coefficients - beta_true, type = "2")
  for (m in model_names[-1]) {
    fit <- savvy_glm.fit2(X_intercept, y, model_class = m, control = control_list, family = family_type)
    l2_matrix[m, j] <- norm(fit$coefficients - beta_true, type = "2")
  }
}

## ----warning=FALSE------------------------------------------------------------
# Tabulate simulation 5.
l2_table <- as.data.frame(l2_matrix)
kable(l2_table, digits = 4, caption = "L2 Distance Between Estimated and True Coefficients (log link for Gamma GLM)")

## ----warning=FALSE------------------------------------------------------------
# Simulation 6: Gamma GLM with the sqrt link. A feasible starting value is
# computed with CVXR so that IRLS begins with a strictly positive linear
# predictor.
# NOTE(review): this chunk calls CVXR functions (Variable, sum_squares,
# psolve, value) but does not attach CVXR itself -- it relies on an earlier
# chunk's library(CVXR). Verify if chunks may be run standalone.
library(savvyGLM)
library(MASS)
library(glm2)
library(knitr)

set.seed(123)
n_val <- 500
p_val <- 25
rho_vals <- c(-0.75, -0.5, 0, 0.5, 0.75)
mu_val <- 0
control_list <- list(maxit = 250, epsilon = 1e-6, trace = FALSE)
family_type <- Gamma(link = "sqrt")

# AR(1) covariance matrix: entry (i, j) equals rho^|i - j|.
sigma.rho <- function(rho_val, p_val) {
  rho_val ^ abs(outer(1:p_val, 1:p_val, "-"))
}

# True coefficients: alternating signs with magnitudes 1, 1, 2, 2, 3, 3, ...
theta_func <- function(p_val) {
  sgn <- rep(c(1, -1), length.out = p_val)
  mag <- ceiling(seq_len(p_val) / 2)
  sgn * mag
}

# Least-squares fit of sqrt(y + 0.1) on x, constrained to a strictly
# positive linear predictor, solved as a convex program with CVXR.
# NOTE(review): the object returned by psolve() is discarded and value(beta)
# is read from the raw Variable; CVXR's documented pattern is
# result <- psolve(problem); result$getValue(beta). Confirm that value(beta)
# is actually populated here, otherwise starting_values may be NA.
findStartingValues <- function(x, y, epsilon = 1e-6) {
  beta <- Variable(ncol(x))
  eta <- x %*% beta
  objective <- Minimize(sum_squares(sqrt(y + 0.1) - eta))
  constraints <- list(eta >= epsilon)
  problem <- Problem(objective, constraints)
  psolve(problem, silent = TRUE)
  starting_values <- as.numeric(value(beta))
  return(starting_values)
}

model_names <- c("OLS", "SR", "GSR", "St", "DSh", "LW", "QIS", "Sh")
l2_matrix <- matrix(NA, nrow = length(model_names), ncol = length(rho_vals),
                    dimnames = list(model_names, paste0("rho=", rho_vals)))

for (j in seq_along(rho_vals)) {
  rho_val <- rho_vals[j]
  Sigma <- sigma.rho(rho_val, p_val)
  X <- mvrnorm(n_val, mu = rep(mu_val, p_val), Sigma = Sigma)
  X_intercept <- cbind(1, X)
  beta_true <- theta_func(p_val + 1)
  # Gamma mean under the sqrt link: square of the linear predictor.
  mu_y <- as.vector((X_intercept %*% beta_true)^2)
  y <- rgamma(n_val, shape = mu_y, scale = 1)
  # Redraw any non-positive responses, capped at 100 attempts.
  retry_count <- 0
  while(any(y <= 0) && retry_count < 100) {
    bad_idx <- which(y <= 0)
    y[bad_idx] <- rgamma(length(bad_idx), shape = mu_y[bad_idx], scale = 1)
    retry_count <- retry_count + 1
  }
  # Feasible start shared by the baseline fit below.
  starting_values <- findStartingValues(X_intercept, y)
  fit_ols <- glm.fit2(X_intercept, y, start = starting_values, control = control_list, family = family_type)
  l2_matrix["OLS", j] <- norm(fit_ols$coefficients - beta_true, type = "2")
  for (m in model_names[-1]) {
    # Added use_parallel = FALSE to dramatically speed up vignette building
    fit <- savvy_glm.fit2(X_intercept, y, model_class = m, control = control_list, family = family_type, use_robust_start = TRUE, use_parallel = FALSE)
    l2_matrix[m, j] <- norm(fit$coefficients - beta_true, type = "2")
  }
}

## 
----warning=FALSE------------------------------------------------------------
# Tabulate simulation 6.
l2_table <- as.data.frame(l2_matrix)
kable(l2_table, digits = 4, caption = "L2 Distance Between Estimated and True Coefficients (sqrt link for Gamma GLM)")

## ----eval=FALSE, warning=FALSE------------------------------------------------
# Real-data study A (not run): Florida claim data, Gamma GLM with log link.
# For each year: repeated 70/30 splits, fit baseline glm.fit2 and each
# shrinkage variant, record the average test-MSE ratio OLS / shrinkage.
# # Load required packages
# library(savvyGLM)
# library(MASS)
# library(glm2)
# library(CVXR)
# library(caret)
# library(knitr)
# 
# set.seed(1234)
# years <- 2014:2015
# model_names <- c("OLS", "SR", "GSR", "St", "DSh")
# N <- 10
# control_list <- list(maxit = 250, epsilon = 1e-6, trace = FALSE)
# family_type <- Gamma(link = "log")
# 
# Evaluation_results <- matrix(NA, nrow = length(model_names), ncol = length(years),
#                              dimnames = list(model_names, years))
# 
# calculate_mse <- function(true_values, predicted_values) {
#   mean((true_values - predicted_values)^2)
# }
# 
# for (yr in years) {
#   cat("Processing year:", yr, "\n")
#   filename <- sprintf("FL_data_%d.csv", yr)
#   data_year <- read.csv(filename, header = TRUE, stringsAsFactors = FALSE)
# 
#   ratio_mat <- matrix(NA, nrow = N, ncol = length(model_names)-1)
#   colnames(ratio_mat) <- model_names[-1]
#   for (i in 1:N) {
#     set.seed(yr * 1000 + i)
#     train_index <- createDataPartition(data_year[, 1], p = 0.7, list = FALSE)
#     train_data <- data_year[train_index, ]
#     test_data <- data_year[-train_index, ]
# 
#     X_train <- as.matrix(train_data[, -1])
#     y_train <- train_data[, 1]
#     X_test <- as.matrix(test_data[, -1])
#     y_test <- test_data[, 1]
#     X_train_int <- cbind(1, X_train)
#     X_test_int <- cbind(1, X_test)
# 
#     # NOTE(review): starting_values is not defined anywhere in this chunk;
#     # this call would fail as written. Presumably it should be computed
#     # (e.g. via a CVXR routine as in the simulation chunks) or dropped.
#     model_glm2 <- glm.fit2(X_train_int, y_train, start = starting_values,
#                            control = control_list, family = family_type)
#     # Inverse log link: exp of the linear predictor.
#     y_pred_glm2 <- exp(X_test_int %*% model_glm2$coefficients)
#     mse_ols <- calculate_mse(y_test, y_pred_glm2)
#     for (m in model_names[-1]) {
#       model_savvy <- savvy_glm.fit2(X_train_int, y_train, model_class = m, control = control_list,
#                                     family = family_type, use_robust_start = TRUE)
#       y_pred
<- exp(X_test_int %*% model_savvy$coefficients)
#       mse_savvy <- calculate_mse(y_test, y_pred)
#       ratio_mat[i, m] <- mse_ols / mse_savvy
#     }
#   }
#   avg_ratios <- c(1, colMeans(ratio_mat, na.rm = TRUE))
#   Evaluation_results[, as.character(yr)] <- avg_ratios
# }

## ----eval=FALSE---------------------------------------------------------------
# Tabulate real-data study A: average MSE ratios per year.
# Evaluation_results_df <- as.data.frame(Evaluation_results)
# kable(Evaluation_results_df, digits = 4,
#       caption = "Average MSE Ratio (OLS / Shrinkage) for Each Year (Gamma GLM with Log Link)")

## ----eval=FALSE, warning=FALSE------------------------------------------------
# Real-data study B (not run): Los Angeles claim data, Gamma GLM with SQRT
# link, years 2014-2023 with N = 100 resampling splits per year.
# # Load required packages
# library(savvyGLM)
# library(MASS)
# library(glm2)
# library(CVXR)
# library(caret)
# library(knitr)
# 
# set.seed(1234)
# years <- 2014:2023
# model_names <- c("OLS", "SR", "GSR", "St", "DSh")
# N <- 100
# control_list <- list(maxit = 250, epsilon = 1e-6, trace = FALSE)
# family_type <- Gamma(link = "sqrt")
# 
# Evaluation_results <- matrix(NA, nrow = length(model_names), ncol = length(years),
#                              dimnames = list(model_names, years))
# 
# calculate_mse <- function(true_values, predicted_values) {
#   mean((true_values - predicted_values)^2)
# }
# 
# for (yr in years) {
#   cat("Processing year:", yr, "\n")
#   filename <- sprintf("LA_data_%d.csv", yr)
#   data_year <- read.csv(filename, header = TRUE, stringsAsFactors = FALSE)
# 
#   ratio_mat <- matrix(NA, nrow = N, ncol = length(model_names)-1)
#   colnames(ratio_mat) <- model_names[-1]
#   for (i in 1:N) {
#     set.seed(yr * 1000 + i)
#     train_index <- createDataPartition(data_year[, 1], p = 0.7, list = FALSE)
#     train_data <- data_year[train_index, ]
#     test_data <- data_year[-train_index, ]
# 
#     X_train <- as.matrix(train_data[, -1])
#     y_train <- train_data[, 1]
#     X_test <- as.matrix(test_data[, -1])
#     y_test <- test_data[, 1]
#     X_train_int <- cbind(1, X_train)
#     X_test_int <- cbind(1, X_test)
# 
#     # NOTE(review): starting_values is not defined in this chunk; the call
#     # below would fail as written. Define it (e.g. via a CVXR feasibility
#     # fit as in the sqrt-link simulation chunk) before running.
#     model_glm2 <- glm.fit2(X_train_int, y_train, start = starting_values,
#                            control = control_list, family = family_type)
#     # Inverse sqrt link: square of the linear predictor.
#     y_pred_glm2 <- (X_test_int %*% model_glm2$coefficients)^2
#     mse_ols <- calculate_mse(y_test, y_pred_glm2)
#     for (m in model_names[-1]) {
#       model_savvy <- savvy_glm.fit2(X_train_int, y_train, model_class = m, control = control_list,
#                                     family = family_type, use_robust_start = TRUE)
#       # FIX(review): this family uses the sqrt link, so the inverse link is
#       # squaring -- the original text used exp() here, which is the inverse
#       # of the LOG link and would bias mse_savvy (the OLS branch above
#       # correctly squares).
#       y_pred <- (X_test_int %*% model_savvy$coefficients)^2
#       mse_savvy <- calculate_mse(y_test, y_pred)
#       ratio_mat[i, m] <- mse_ols / mse_savvy
#     }
#   }
#   avg_ratios <- c(1, colMeans(ratio_mat, na.rm = TRUE))
#   Evaluation_results[, as.character(yr)] <- avg_ratios
# }

## ----eval=FALSE---------------------------------------------------------------
# Tabulate real-data study B. (Caption corrected: this study uses the sqrt
# link, not the log link.)
# Evaluation_results_df <- as.data.frame(Evaluation_results)
# kable(Evaluation_results_df, digits = 4,
#       caption = "Average MSE Ratio (OLS / Shrinkage) for Each Year (Gamma GLM with Sqrt Link)")