rSDR_vignette

Introduction

This R Markdown document demonstrates the usage of the rSDR package for Robust Sufficient Dimension Reduction (rSDR) using alpha-distance covariance and Stiefel manifold learning for estimation (Hsin-Hsiung Huang, Feng Yu & Teng Zhang, 2024). We apply the method to ionosphere dataset, including cross-validation, bootstrap methods for optimal alpha selection, and visualization of results for both 2 and 3 dimensions.

Setup

#
# # Load rSDR package
# if (!requireNamespace("rSDR", quietly = TRUE)) {
#   if (!requireNamespace("devtools", quietly = TRUE)){   install.packages("devtools")
# }
#   library(devtools)
#   devtools::install_github("scchen1/rSDR")
#   # or
#
#   #a local zip file (`rSDR-main.zip`) downloaded from GitHub
#   #devtools::install_local(path='yourpath/rSDR-main.zip')
# }

Data Preparation

We use the ionosphere dataset from the fdm2id package. The dataset has 33 predictor variables (V1-V33) and one response variable (V35, ‘g’ for good or ‘b’ for bad).

The data include a response variable Y with nx1 matrix and predictors X with nxp matrix, where n is the number of observation and p is the number of variable.

#Load ionosphere data in fdm2id package
utils::data("ionosphere", package = "fdm2id")

X<-as.matrix(ionosphere[,c(1:33)])
Y<-ifelse(ionosphere[,34]=='b',0,1)
# Y will be used for rSDR
Y<-matrix(Y,length(Y),1)

#Y.factor will be used in plot_rSDR
Y.factor<-factor(ionosphere$V35,levels=c('b','g'),labels=c('Bad','Good'))

Robust Sufficient Dimension Reduction

Example: 3-Dimensional Reduction

set.seed(2435)
sdr_result<-rSDR(X=X, Y=Y, d=3, alpha=0.3,maxiter=1000,tol=1e-7)


# An optimal of C is obtained by maximizing the target function using
# ManifoldOptim method
head(sdr_result$C_value)
#>              [,1]       [,2]        [,3]
#> [1,]  0.477624035 0.62997157  0.47906752
#> [2,] -0.007745065 0.08908159  0.16982172
#> [3,]  0.098373416 0.07069420 -0.07903158
#> [4,] -0.198450270 0.21195128 -0.03610834
#> [5,] -0.027881462 0.05394432  0.05692702
#> [6,] -0.184002562 0.08140281  0.12003292

#The value of cost function f is equal to the negative of the target function.
sdr_result$f_value
#> [1] -0.06239673

#beta=sigma_x^(-0.5)%*%C
head(sdr_result$beta)
#>              [,1]        [,2]        [,3]
#> [1,]  2.116399672  2.23803342  1.50986045
#> [2,] -0.001570308 -0.21906430  0.25601573
#> [3,]  0.177081704  0.22080127 -0.33791989
#> [4,] -0.263345072  0.20829954 -0.07832153
#> [5,] -0.041349209  0.09978753 -0.09141928
#> [6,] -0.223808231  0.03036088  0.08491553

#projected_data: This projects X onto the estimated SDR directions (X %*% beta),
#the projected matrix (data) n x d
head(sdr_result$projected_data)
#>         V1       V2           V3
#> 1 1.755218 2.235456  1.736432496
#> 2 1.549962 2.969911  0.985671416
#> 3 1.706690 2.267038  1.724000571
#> 4 1.457957 3.882664 -0.004265498
#> 5 1.909327 2.291934  1.583547872
#> 6 2.043657 2.448369  1.305980189

Visualizing the 3D Projected Data

#Plot for 3-dimensional reduction using rSDR

plot_rSDR(projected_data=sdr_result$projected_data,Y=Y.factor,Y.name='group',colors=c("#374E55FF", "#DF8F44FF"))
Plot for 3-dimensional reduction using rSDR
Plot for 3-dimensional reduction using rSDR

Cross-Validation for Optimal Alpha (d=3)

# plan(multisession) will launch parallel workers running in the background
# to save running time. To shut down background workers launched this way, call
# plan(sequential)
# use all local cores except one
# future::plan(future::multisession, workers = future::availableCores() - 1)
# use 2 cores for parallel
future::plan("multisession", workers = 2)
set.seed(2435)
opt_results<-optimal_alpha_cv( alpha.v=c(0.3, 0.5, 0.7),X=X,Y=Y,d=3,kfolds=5)
#> This will take a few seconds or minutes to get the results.

# Optimal alpha using cross validation
opt_results$opt.alpha
#> [1] 0.3

# The cost function values by alpha (column) for each cross validation (row)
head(opt_results$f_test)
#>              0.3         0.5         0.7
#> [1,] -0.07676051 -0.06542722 -0.06576929
#> [2,] -0.06868129 -0.08072269 -0.07676208
#> [3,] -0.07367396 -0.06965283 -0.06994534
#> [4,] -0.09014696 -0.07462450 -0.08855503
#> [5,] -0.08849881 -0.09640043 -0.09481738

# The mean of cost function value by alpha
opt_results$f_test.mean
#>         0.3         0.5         0.7 
#> -0.07955230 -0.07736554 -0.07916983

# The standard deviation of cost function value by alpha
opt_results$f_test.sd
#>         0.3         0.5         0.7 
#> 0.009391681 0.012073941 0.012282007

# The reduced dimension
opt_results$d
#> [1] 3

# Number of folds
opt_results$kfolds
#> [1] 5
plot_alpha(opt_results=opt_results)
Mean and standard deviation of cost function (k-fold cross-validation method)
Mean and standard deviation of cost function (k-fold cross-validation method)

Bootstrap for Optimal Alpha (d=3)

# plan(multisession) will launch parallel workers running in the background
# to save running time. To shut down background workers launched this way, call
# plan(sequential)
# use all local cores except one
# future::plan(future::multisession, workers = future::availableCores() - 1)
# use 2 cores for parallel
future::plan("multisession", workers = 2)
set.seed(2435)
opt_results<-optimal_alpha_boot(alpha.v=c(0.3,0.5, 0.7),X=X,Y=Y,d=3,R=10)
#> This will take a few seconds or minutes to get the results.
#opt_results<-optimal_alpha_boot(alpha.v=c(0.3,0.4,0.5,0.6, 0.7),X=X,Y=Y,d=3,R=50)

# Optimal alpha using bootstrap method
opt_results$opt.alpha
#> [1] 0.5

# The cost function values by alpha (column) for R bootstrap replicates (row)
head(opt_results$f_test)
#>              0.3         0.5         0.7
#> [1,] -0.06818132 -0.06586804 -0.06604205
#> [2,] -0.06660301 -0.06328631 -0.06056598
#> [3,] -0.08728901 -0.08835463 -0.09099805
#> [4,] -0.07430642 -0.07331597 -0.07197419
#> [5,] -0.07300069 -0.06876824 -0.06554878
#> [6,] -0.06250337 -0.06668350 -0.07026673

# The mean of cost function value by alpha
opt_results$f_test.mean
#>         0.3         0.5         0.7 
#> -0.07045355 -0.07142157 -0.07001816

# The standard deviation of cost function value by alpha
opt_results$f_test.sd
#>         0.3         0.5         0.7 
#> 0.009578206 0.010000187 0.010852565

# The reduced dimension
opt_results$d
#> [1] 3

# Number of bootstrap replicates
opt_results$R
#> [1] 10
plot_alpha(opt_results=opt_results)
Mean and standard deviation of cost function (bootstrap method)
Mean and standard deviation of cost function (bootstrap method)

Example: 2-Dimensional Reduction and Plotting

set.seed(2435)
sdr_result<-rSDR(X=X, Y=Y, d=2, alpha=0.3,maxiter=1000,tol=1e-7)

# An optimal of C is obtained by maximizing the target function using ManifoldOptim method
head(sdr_result$C_value)
#>             [,1]        [,2]
#> [1,] -0.73621051 -0.08042223
#> [2,] -0.05058860  0.43921890
#> [3,]  0.02071660  0.08780910
#> [4,] -0.17383517  0.30821429
#> [5,] -0.06873169  0.12609213
#> [6,] -0.17728017  0.13615239

#The value of cost function f is equal to the negative of the target function.
sdr_result$f_value
#> [1] -0.05892882

#beta=sigma_x^(-0.5)%*%C
head(sdr_result$beta)
#>              [,1]       [,2]
#> [1,] -2.823607670 -0.8837528
#> [2,]  0.357104260  1.0101891
#> [3,]  0.254831740  0.2391243
#> [4,] -0.395722128  0.2684305
#> [5,]  0.008238173  0.3284309
#> [6,] -0.455105805 -0.3398876

#projected_data: This projects X onto the estimated SDR directions (X %*% beta), the projected matrix (data) n x d
head(sdr_result$projected_data)
#>          V1         V2
#> 1 -2.999827  0.3706711
#> 2 -2.364104  0.4028090
#> 3 -2.972651  0.4397894
#> 4 -3.742589 -1.3652387
#> 5 -2.856985  0.2290595
#> 6 -2.662073 -0.8883764
#Plot for 2-dimensional reduction using rSDR
plot_rSDR(projected_data=sdr_result$projected_data,Y=Y.factor,Y.name='group',colors=c("#374E55FF", "#DF8F44FF"))

Conclusion

This document tested the core functionality of the rSDR package, including:

The package successfully processes the ionosphere dataset, optimizes the alpha parameter, and produces meaningful visualizations of the reduced dimensions.

Reference

Hsin-Hsiung Huang, Feng Yu & Teng Zhang (19 Feb 2024): Robust sufficient dimension reduction via α-distance covariance, Journal of Nonparametric Statistics, DOI:10.1080/10485252.2024.2313137