Introduction to roclab

library(roclab)

Overview

The roclab package implements ROC-Optimizing Binary Classifiers, supporting both linear and kernel models.

For large-scale data, training the model is computationally prohibitive because its loss is a U-statistic involving a double summation over all positive-negative pairs. To reduce this burden, the package adopts an efficient algorithm based on an incomplete U-statistic, which approximates the loss with a single summation over a random subset of pairs. In kernel models, a Nyström low-rank approximation is further applied to approximate the kernel matrix efficiently. These approximations substantially reduce computational cost and accelerate training while maintaining accuracy, making ROC-Optimizing Binary Classifiers practical for large-scale datasets.
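
To illustrate the idea (this sketch is not roclab's implementation), the snippet below compares the full pairwise hinge loss, a double summation over all positive-negative pairs, with an incomplete U-statistic that averages the same loss over B randomly sampled pairs; all object names are made up for the example.

# Illustration only: full vs. incomplete U-statistic pairwise hinge loss
set.seed(1)
n <- 500
X <- matrix(rnorm(2 * n), ncol = 2)
y <- ifelse(X[, 1] + X[, 2] + rnorm(n) > 0, 1, -1)
w <- c(1, 1)                      # a fixed linear coefficient vector
score <- as.vector(X %*% w)
pos <- which(y == 1)
neg <- which(y == -1)

# Full U-statistic: average over all positive-negative pairs (double summation)
loss_full <- mean(outer(score[pos], score[neg],
                        function(sp, sn) pmax(0, 1 - (sp - sn))))

# Incomplete U-statistic: average over B randomly sampled pairs (single summation)
B <- 1000
i_pos <- sample(pos, B, replace = TRUE)
i_neg <- sample(neg, B, replace = TRUE)
loss_incomplete <- mean(pmax(0, 1 - (score[i_pos] - score[i_neg])))

c(full = loss_full, incomplete = loss_incomplete)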

In addition, efficient optimization is performed using Gradient Descent with the Adamax update rule (a variant of Adam based on the infinity norm) for linear models with the ridge penalty and for kernel models. For linear models with other penalties (i.e., those involving variable selection), Proximal Gradient Descent with an Adamax adaptive learning-rate scheme is employed.
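
For reference, here is a minimal sketch of a single Adamax update written in plain R; the function and hyper-parameter names are illustrative and do not correspond to roclab internals.

# Minimal Adamax step (illustrative only): m tracks the first moment of the
# gradient, u tracks an exponentially weighted infinity norm of past gradients
adamax_step <- function(theta, grad, state,
                        alpha = 0.002, beta1 = 0.9, beta2 = 0.999, eps = 1e-8) {
  state$t <- state$t + 1
  state$m <- beta1 * state$m + (1 - beta1) * grad
  state$u <- pmax(beta2 * state$u, abs(grad))
  theta   <- theta - (alpha / (1 - beta1^state$t)) * state$m / (state$u + eps)
  list(theta = theta, state = state)
}

# Usage: initialize the state once, then call adamax_step() every iteration
state <- list(t = 0, m = rep(0, 2), u = rep(0, 2))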

Example: Linear Model

set.seed(123)
n_lin <- 1500
n_pos_lin <- round(0.2 * n_lin)
n_neg_lin <- n_lin - n_pos_lin

X_train_lin <- rbind(
  matrix(rnorm(2 * n_neg_lin, mean = -1), ncol = 2),
  matrix(rnorm(2 * n_pos_lin, mean =  1), ncol = 2)
)
y_train_lin <- c(rep(-1, n_neg_lin), rep(1, n_pos_lin))

# Fit a linear model
fit_lin <- roclearn(X_train_lin, y_train_lin, lambda = 0.1)

# Summary
summary(fit_lin)
#> Linear Model Summary 
#> ----------------------
#> Call:
#> roclearn(X = X_train_lin, y = y_train_lin, lambda = 0.1)
#> 
#> Number of observations: 1500 
#> Number of predictors: 2 
#> 
#> Penalty: ridge 
#> Lambda: 0.1 
#> Loss function: hinge 
#> Approximation: Yes (B = 1500 )
#> 
#> Intercept: 0.0873 
#> Converged: TRUE 
#> Iterations: 1426 
#> Training time (sec): 0.05 
#> 
#> Coefficients (first 10):
#>        V1        V2 
#> 0.4844960 0.5398616

n_test_lin <- 300
n_pos_test_lin <- round(0.2 * n_test_lin)
n_neg_test_lin <- n_test_lin - n_pos_test_lin
X_test_lin <- rbind(
  matrix(rnorm(2 * n_neg_test_lin, mean = -1), ncol = 2),
  matrix(rnorm(2 * n_pos_test_lin, mean =  1), ncol = 2)
)
y_test_lin <- c(rep(-1, n_neg_test_lin), rep(1, n_pos_test_lin))

# Predict decision scores
pred_score_lin <- predict(fit_lin, X_test_lin, type = "response")
head(pred_score_lin)
#> [1]  0.7313566 -0.5813919  0.1561111 -1.0342995 -1.1555102 -0.5528892

# Predict classes {-1, 1}
pred_class_lin <- predict(fit_lin, X_test_lin, type = "class")
head(pred_class_lin)
#> [1]  1 -1  1 -1 -1 -1

# AUC on the test set
auc(fit_lin, X_test_lin, y_test_lin)
#> [1] 0.9665972
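
As a sanity check (not part of the package API), the empirical AUC can also be computed directly from the decision scores as the proportion of positive-negative pairs that are correctly ordered, with ties counted as one half; this should closely agree with the value returned by auc() above.

# Manual AUC from the decision scores (ties counted as 1/2)
sp <- pred_score_lin[y_test_lin == 1]
sn <- pred_score_lin[y_test_lin == -1]
mean(outer(sp, sn, ">") + 0.5 * outer(sp, sn, "=="))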

Example: Kernel Model

set.seed(123)
n_ker <- 1500
r_train_ker <- sqrt(runif(n_ker, 0.05, 1))
theta_train_ker <- runif(n_ker, 0, 2*pi)
X_train_ker <- cbind(r_train_ker * cos(theta_train_ker), r_train_ker * sin(theta_train_ker))
y_train_ker <- ifelse(r_train_ker < 0.5, 1, -1)

# Fit a kernel model
fit_ker <- kroclearn(X_train_ker, y_train_ker, lambda = 0.1, kernel = "radial")

# Summary
summary(fit_ker)
#> Kernel Model Summary 
#> -----------------------------
#> Call:
#> kroclearn(X = X_train_ker, y = y_train_ker, lambda = 0.1, kernel = "radial")
#> 
#> Number of observations: 1500 
#> Number of predictors: 2 
#> 
#> Kernel: radial 
#> Kernel parameter: 0.5 
#> Lambda: 0.1 
#> Loss function: hinge 
#> Approximation: Yes (Nystrom, B = 1500 )
#>   Nystrom landmarks: 39 
#> 
#> Intercept: -1.421 
#> Converged: TRUE 
#> Iterations: 891 
#> Training time (sec): 0.18 
#> 
#> Theta coefficients (first 10):
#>  [1] -0.07135333  0.25788424  0.12964322 -0.05642086  0.19505142  0.65529538
#>  [7]  0.08096419  0.47000919  0.13780996  0.40101365
#> ...

n_test_ker <- 300
r_test_ker <- sqrt(runif(n_test_ker, 0.05, 1))
theta_test_ker <- runif(n_test_ker, 0, 2*pi)
X_test_ker <- cbind(r_test_ker * cos(theta_test_ker), r_test_ker * sin(theta_test_ker))
y_test_ker <- ifelse(r_test_ker < 0.5, 1, -1)

# Predict decision scores
pred_score_ker <- predict(fit_ker, X_test_ker, type = "response")
head(pred_score_ker)
#> [1] -0.2853417 -0.1786982 -0.2537492 -0.1018808 -0.7098034  0.1883996

# Predict classes {-1, 1}
pred_class_ker <- predict(fit_ker, X_test_ker, type = "class")
head(pred_class_ker)
#> [1] -1 -1 -1 -1 -1  1

# AUC on the test set
auc(fit_ker, X_test_ker, y_test_ker)
#> [1] 0.9970842

Cross-Validation

# 5-fold CV for linear models
cvfit_lin <- cv.roclearn(
  X_train_lin, y_train_lin,
  lambda.vec = exp(seq(log(0.01), log(5), length.out = 20)),
  nfolds = 5
)

# Summarize the cross-validation result
summary(cvfit_lin)
#> Cross-validated Linear Model 
#> -----------------------
#> Loss: hinge 
#> Penalty: ridge 
#> Number of folds: 5 
#> Number of candidate lambdas: 20 
#> 
#> Optimal lambda: 0.1898714 
#> Cross-validated AUC at optimal lambda: 0.9791 (+/- 0.0040) 
#> 
#> Overall AUC (mean +/- sd) across lambdas:
#>    lambda auc.mean auc.sd
#> 1  0.0100   0.9783 0.0040
#> 2  0.0139   0.9790 0.0040
#> 3  0.0192   0.9790 0.0040
#> 4  0.0267   0.9789 0.0037
#> 5  0.0370   0.9789 0.0039
#> 6  0.0513   0.9784 0.0045
#> 7  0.0712   0.9789 0.0039
#> 8  0.0987   0.9785 0.0040
#> 9  0.1370   0.9786 0.0040
#> 10 0.1900   0.9791 0.0040
#> ...
# Plot the cross-validation AUC across lambda values
plot(cvfit_lin)
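
One way to act on the cross-validation result is to refit the model at the selected penalty and evaluate it on the test set; the lambda value is copied from the summary output above rather than extracted from the cv object, whose internal field names are not assumed here.

# Refit at the cross-validated optimal lambda and evaluate on the test set
fit_lin_cv <- roclearn(X_train_lin, y_train_lin, lambda = 0.1898714)
auc(fit_lin_cv, X_test_lin, y_test_lin)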

# 5-fold CV for kernel models
cvfit_ker <- cv.kroclearn(
  X_train_ker, y_train_ker,
  lambda.vec = exp(seq(log(0.01), log(5), length.out = 20)),
  kernel = "radial",
  nfolds = 5
)

# Summarize the cross-validation result
summary(cvfit_ker)
#> Cross-validated Kernel Model 
#> ------------------------------
#> Loss: hinge 
#> Kernel: radial 
#> Number of folds: 5 
#> Number of candidate lambdas: 20 
#> 
#> Optimal lambda: 0.03700021 
#> Cross-validated AUC at optimal lambda: 0.9997 (+/- 0.0004) 
#> 
#> Overall AUC (mean +/- sd) across lambdas:
#>    lambda auc.mean auc.sd
#> 1  0.0100   0.9988 0.0008
#> 2  0.0139   0.9996 0.0003
#> 3  0.0192   0.9996 0.0004
#> 4  0.0267   0.9993 0.0004
#> 5  0.0370   0.9997 0.0004
#> 6  0.0513   0.9990 0.0008
#> 7  0.0712   0.9989 0.0009
#> 8  0.0987   0.9961 0.0024
#> 9  0.1370   0.9832 0.0159
#> 10 0.1900   0.9868 0.0138
#> ...
# Plot the cross-validation AUC across lambda values
plot(cvfit_ker)

Real Data Example 1: Ionosphere

library(mlbench)
data(Ionosphere)

# Prepare data
X_iono <- Ionosphere[, -35]
y_iono <- ifelse(Ionosphere$Class == "bad", 1, -1)

set.seed(123)
train_idx <- sample(seq_len(nrow(X_iono)), size = 200)
X_train_iono <- X_iono[train_idx, ]
y_train_iono <- y_iono[train_idx]
X_test_iono  <- X_iono[-train_idx, ]
y_test_iono  <- y_iono[-train_idx]

# Fit a linear model
fit_iono_lin <- roclearn(X_train_iono, y_train_iono, lambda = 0.1, approx=TRUE)
#> Warning: Removing constant columns: V2
#> [V1]: treated as categorical and one-hot encoded.
summary(fit_iono_lin)
#> Linear Model Summary 
#> ----------------------
#> Call:
#> roclearn(X = X_train_iono, y = y_train_iono, lambda = 0.1, approx = TRUE)
#> 
#> Number of observations: 200 
#> Number of predictors: 33 
#> 
#> Penalty: ridge 
#> Lambda: 0.1 
#> Loss function: hinge 
#> Approximation: Yes (B = 200 )
#> 
#> Intercept: 1.3041 
#> Converged: TRUE 
#> Iterations: 1164 
#> Training time (sec): 0.04 
#> 
#> Coefficients (first 10):
#>          V3          V4          V5          V6          V7          V8 
#> -0.40674510 -0.18973373 -0.42204641 -0.22075386 -0.17022688 -0.43640505 
#>          V9         V10         V11         V12 
#> -0.12857765 -0.24904508  0.08956809 -0.11456464 
#> ...
#> 
#> Categorical variables dummy-encoded:
#> [1] "V1"
#> Removed columns:
#> [1] "V2"

# Predict decision scores
pred_score_iono_lin <- predict(fit_iono_lin, X_test_iono, type = "response")
head(pred_score_iono_lin)
#> [1]  0.7813989 -0.4333132  1.0361641  1.5973041  0.2972169 -0.4974081

# Predict classes {-1, 1}
pred_class_iono_lin <- predict(fit_iono_lin, X_test_iono, type = "class")
head(pred_class_iono_lin)
#> [1]  1 -1  1  1  1 -1

# AUC on the test set
auc(fit_iono_lin, X_test_iono, y_test_iono)
#> [1] 0.9184803
# 5-fold CV for linear models
cvfit_iono_lin <- cv.roclearn(
  X_train_iono, y_train_iono,
  lambda.vec = exp(seq(log(0.01), log(5), length.out = 10)),
  approx=TRUE, nfolds=5)
#> Warning: Removing constant columns: V2
#> [V1]: treated as categorical and one-hot encoded.
summary(cvfit_iono_lin)
#> Cross-validated Linear Model 
#> -----------------------
#> Loss: hinge 
#> Penalty: ridge 
#> Number of folds: 5 
#> Number of candidate lambdas: 10 
#> 
#> Optimal lambda: 0.3158114 
#> Cross-validated AUC at optimal lambda: 0.8633 (+/- 0.0965) 
#> 
#> Overall AUC (mean +/- sd) across lambdas:
#>    lambda auc.mean auc.sd
#> 1  0.0100   0.8072 0.1232
#> 2  0.0199   0.8236 0.1222
#> 3  0.0398   0.8408 0.0943
#> 4  0.0794   0.8624 0.0794
#> 5  0.1580   0.8501 0.0866
#> 6  0.3160   0.8633 0.0965
#> 7  0.6300   0.8489 0.0896
#> 8  1.2600   0.8250 0.1270
#> 9  2.5100   0.8068 0.1361
#> 10 5.0000   0.7776 0.1213
# Plot the cross-validation AUC across lambda values
plot(cvfit_iono_lin)

# Fit a kernel model
fit_iono_ker <- kroclearn(X_train_iono, y_train_iono, lambda = 0.1, kernel = "radial", approx=TRUE)
#> Warning: Removing constant columns: V2
#> [V1]: treated as categorical and one-hot encoded.
summary(fit_iono_ker)
#> Kernel Model Summary 
#> -----------------------------
#> Call:
#> kroclearn(X = X_train_iono, y = y_train_iono, lambda = 0.1, kernel = "radial", 
#>     approx = TRUE)
#> 
#> Number of observations: 200 
#> Number of predictors: 33 
#> 
#> Kernel: radial 
#> Kernel parameter: 0.03030303 
#> Lambda: 0.1 
#> Loss function: hinge 
#> Approximation: Yes (Nystrom, B = 200 )
#>   Nystrom landmarks: 14 
#> 
#> Intercept: 0.7425 
#> Converged: TRUE 
#> Iterations: 1276 
#> Training time (sec): 0.01 
#> 
#> Theta coefficients (first 10):
#>  [1] -0.87955477  0.22849258 -0.33857814 -0.67966979 -0.04122377 -0.17061431
#>  [7]  0.08362898 -0.76337429  0.35662077 -0.52693062
#> ...
#> 
#> Categorical variables dummy-encoded:
#> [1] "V1"
#> Removed columns:
#> [1] "V2"

# Predict decision scores
pred_score_iono_ker <- predict(fit_iono_ker, X_test_iono, type = "response")
head(pred_score_iono_ker)
#> [1]  0.1672607 -0.6231523  0.2522829  0.4332831 -0.2635053 -0.6066902

# Predict classes {-1, 1}
pred_class_iono_ker <- predict(fit_iono_ker, X_test_iono, type = "class")
head(pred_class_iono_ker)
#> [1]  1 -1  1  1 -1 -1

# AUC on the test set
auc(fit_iono_ker, X_test_iono, y_test_iono)
#> [1] 0.8999618
# 5-fold CV for kernel models
cvfit_iono_ker <- cv.kroclearn(
  X_train_iono, y_train_iono,
  lambda.vec = exp(seq(log(0.01), log(5), length.out = 10)),
  kernel = "radial",
  approx=TRUE, nfolds=5)
#> Warning: Removing constant columns: V2
#> [V1]: treated as categorical and one-hot encoded.
summary(cvfit_iono_ker)
#> Cross-validated Kernel Model 
#> ------------------------------
#> Loss: hinge 
#> Kernel: radial 
#> Number of folds: 5 
#> Number of candidate lambdas: 10 
#> 
#> Optimal lambda: 0.01994737 
#> Cross-validated AUC at optimal lambda: 0.9153 (+/- 0.0226) 
#> 
#> Overall AUC (mean +/- sd) across lambdas:
#>    lambda auc.mean auc.sd
#> 1  0.0100   0.8981 0.0560
#> 2  0.0199   0.9153 0.0226
#> 3  0.0398   0.8671 0.0773
#> 4  0.0794   0.8608 0.0530
#> 5  0.1580   0.8189 0.0580
#> 6  0.3160   0.8189 0.0665
#> 7  0.6300   0.7965 0.0450
#> 8  1.2600   0.7944 0.0879
#> 9  2.5100   0.8041 0.0649
#> 10 5.0000   0.8137 0.0387
# Plot the cross-validation AUC across lambda values
plot(cvfit_iono_ker)

Real Data Example 2: Spam (larger dataset)

library(kernlab)
data(spam)

# Prepare data
X_spam <- spam[, -58]
y_spam <- ifelse(spam$type == "spam", 1, -1)

set.seed(123)
train_idx <- sample(seq_len(nrow(X_spam)), size = 3000)
X_train_spam <- X_spam[train_idx, ]
y_train_spam <- y_spam[train_idx]
X_test_spam  <- X_spam[-train_idx, ]
y_test_spam  <- y_spam[-train_idx]

# Fit a linear model
fit_spam_lin <- roclearn(X_train_spam, y_train_spam, lambda = 0.1)
summary(fit_spam_lin)
#> Linear Model Summary 
#> ----------------------
#> Call:
#> roclearn(X = X_train_spam, y = y_train_spam, lambda = 0.1)
#> 
#> Number of observations: 3000 
#> Number of predictors: 57 
#> 
#> Penalty: ridge 
#> Lambda: 0.1 
#> Loss function: hinge 
#> Approximation: Yes (B = 3000 )
#> 
#> Intercept: -0.5147 
#> Converged: TRUE 
#> Iterations: 10 
#> Training time (sec): 0.01 
#> 
#> Coefficients (first 10):
#>          make       address           all         num3d           our 
#>  4.321339e-06 -5.145267e-06  1.017238e-05  5.731498e-06  1.621501e-05 
#>          over        remove      internet         order          mail 
#>  5.582747e-06  1.344279e-05  7.710516e-06  3.596203e-06  7.202070e-06 
#> ...

# Predict decision scores
pred_score_spam_lin <- predict(fit_spam_lin, X_test_spam, type = "response")
head(pred_score_spam_lin)
#>          3          6          8         10         14         15 
#> 10.4085857 -0.2464476 -0.2763554  2.8799679 -0.3900616  0.4386678

# Predict classes {-1, 1}
pred_class_spam_lin <- predict(fit_spam_lin, X_test_spam, type = "class")
head(pred_class_spam_lin)
#>  3  6  8 10 14 15 
#>  1 -1 -1  1 -1  1

# AUC on the test set
auc(fit_spam_lin, X_test_spam, y_test_spam)
#> [1] 0.7624179
# 5-fold CV for linear models 
cvfit_spam_lin <- cv.roclearn(
  X_train_spam, y_train_spam,
  lambda.vec = exp(seq(log(0.01), log(5), length.out = 10)), nfolds=5)
summary(cvfit_spam_lin)
#> Cross-validated Linear Model 
#> -----------------------
#> Loss: hinge 
#> Penalty: ridge 
#> Number of folds: 5 
#> Number of candidate lambdas: 10 
#> 
#> Optimal lambda: 0.3158114 
#> Cross-validated AUC at optimal lambda: 0.7777 (+/- 0.0232) 
#> 
#> Overall AUC (mean +/- sd) across lambdas:
#>    lambda auc.mean auc.sd
#> 1  0.0100   0.7715 0.0167
#> 2  0.0199   0.7764 0.0194
#> 3  0.0398   0.7741 0.0215
#> 4  0.0794   0.7748 0.0220
#> 5  0.1580   0.7758 0.0252
#> 6  0.3160   0.7777 0.0232
#> 7  0.6300   0.7735 0.0233
#> 8  1.2600   0.7753 0.0192
#> 9  2.5100   0.7715 0.0199
#> 10 5.0000   0.7722 0.0224
# Plot the cross-validation AUC across lambda values
plot(cvfit_spam_lin)

# Fit a kernel model
fit_spam_ker <- kroclearn(X_train_spam, y_train_spam, lambda = 0.1, kernel = "radial")
summary(fit_spam_ker)
#> Kernel Model Summary 
#> -----------------------------
#> Call:
#> kroclearn(X = X_train_spam, y = y_train_spam, lambda = 0.1, kernel = "radial")
#> 
#> Number of observations: 3000 
#> Number of predictors: 57 
#> 
#> Kernel: radial 
#> Kernel parameter: 0.01754386 
#> Lambda: 0.1 
#> Loss function: hinge 
#> Approximation: Yes (Nystrom, B = 3000 )
#>   Nystrom landmarks: 55 
#> 
#> Intercept: 5e-04 
#> Converged: TRUE 
#> Iterations: 607 
#> Training time (sec): 0.78 
#> 
#> Theta coefficients (first 10):
#>  [1] -0.106694761  0.034588410 -0.155193553 -0.091769201  0.008358426
#>  [6]  0.039672217 -0.191470994 -0.079506402  0.013024674 -0.468039804
#> ...

# Predict decision scores
pred_score_spam_ker <- predict(fit_spam_ker, X_test_spam, type = "response")
head(pred_score_spam_ker)
#>             3             6             8            10            14 
#>  0.0005386429 -0.0461043933 -0.1537764925  0.0005388521 -0.3817359714 
#>            15 
#>  0.0005386429

# Predict classes {-1, 1}
pred_class_spam_ker <- predict(fit_spam_ker, X_test_spam, type = "class")
head(pred_class_spam_ker)
#>  3  6  8 10 14 15 
#>  1 -1 -1  1 -1  1

# AUC on the test set
auc(fit_spam_ker, X_test_spam, y_test_spam)
#> [1] 0.7513072
# 5-fold CV for kernel models 
cvfit_spam_ker <- cv.kroclearn(
  X_train_spam, y_train_spam,
  kernel = "radial", 
  lambda.vec = exp(seq(log(0.01), log(5), length.out = 10)), nfolds=5)
summary(cvfit_spam_ker)
#> Cross-validated Kernel Model 
#> ------------------------------
#> Loss: hinge 
#> Kernel: radial 
#> Number of folds: 5 
#> Number of candidate lambdas: 10 
#> 
#> Optimal lambda: 5 
#> Cross-validated AUC at optimal lambda: 0.7385 (+/- 0.0146) 
#> 
#> Overall AUC (mean +/- sd) across lambdas:
#>    lambda auc.mean auc.sd
#> 1  0.0100   0.7303 0.0146
#> 2  0.0199   0.7173 0.0339
#> 3  0.0398   0.7301 0.0139
#> 4  0.0794   0.7287 0.0167
#> 5  0.1580   0.7258 0.0184
#> 6  0.3160   0.7196 0.0162
#> 7  0.6300   0.7281 0.0295
#> 8  1.2600   0.7331 0.0168
#> 9  2.5100   0.7259 0.0292
#> 10 5.0000   0.7385 0.0146
# Plot the cross-validation AUC across lambda values
plot(cvfit_spam_ker)

Conclusion

The roclab package implements ROC-Optimizing Binary Classifiers through both linear models (with regularization penalties) and kernel models (with various kernel functions). It supports multiple surrogate loss functions and incorporates scalable options to efficiently handle large datasets.