---
title: "Using e2tree with XGBoost, GBM, LightGBM, and CatBoost"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Using e2tree with XGBoost, GBM, LightGBM, and CatBoost}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE  # examples require optional packages; set eval=TRUE locally
)
```

## Overview

Starting from version 1.1.0, `e2tree` supports the following tree ensemble
backends in addition to `randomForest` and `ranger`:

| Package | Model class | Task |
|---------|-------------|------|
| **xgboost** | `xgb.Booster` | classification, regression |
| **gbm** | `gbm` | classification, regression |
| **lightgbm** | `lgb.Booster` | classification, regression |
| **catboost** | `catboost.CatBoost` / `catboost.Model` | classification, regression |

The workflow is identical regardless of the backend: train a model, build the
dissimilarity matrix with `createDisMatrix()`, then call `e2tree()`. Worked
examples for XGBoost, GBM, and LightGBM are shown below; CatBoost models
follow the same steps.

---

## XGBoost

### Classification (iris)

```{r xgb-clf}
library(e2tree)

# Probe for the optional backend without attaching it; install only if it is
# genuinely missing, then attach once.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost", repos = "https://cran.r-project.org")
}
library(xgboost)

data(iris)
set.seed(42)
n   <- floor(0.75 * nrow(iris))
idx <- sample(nrow(iris), n)
tr  <- iris[idx, ]
# Hold-out rows (not used further in this example). Indexing by the sampled
# positions avoids relying on the row names being "1", "2", ...
va  <- iris[-idx, ]

# XGBoost requires a numeric matrix and 0-indexed integer labels
X_tr  <- as.matrix(tr[, 1:4])
y_tr  <- as.integer(tr$Species) - 1L
dm_tr <- xgb.DMatrix(data = X_tr, label = y_tr)

ensemble <- xgb.train(
  params  = list(objective = "multi:softmax", num_class = 3,
                 max_depth = 4, eta = 0.3),
  data    = dm_tr,
  nrounds = 100,
  verbose = 0
)

# Attach the response back to the data.frame so the formula in e2tree()
# can find it; createDisMatrix() will use it to annotate the dissimilarity
# matrix (in classification, `label` is optional but recommended).
tr_xgb <- tr[, 1:4]
tr_xgb$Species <- tr$Species

D <- createDisMatrix(ensemble, data = tr_xgb, label = "Species",
                     parallel = list(active = FALSE, no_cores = 1))

setting  <- list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5)
tree_xgb <- e2tree(Species ~ ., data = tr_xgb, D = D,
                   ensemble = ensemble, setting = setting)
print(tree_xgb)
```

### Regression (mtcars)

For regression backends, `createDisMatrix()` needs the response column to
compute the dissimilarity scale. Pass the full data frame (predictors plus
response) and the name of the response column via the `label` argument.

```{r xgb-reg}
library(xgboost)

data(mtcars)
set.seed(42)
n  <- floor(0.75 * nrow(mtcars))
tr <- mtcars[sample(nrow(mtcars), n), ]

# Drop the response by name rather than by position
X_tr  <- as.matrix(tr[, setdiff(names(tr), "mpg")])
y_tr  <- tr$mpg
dm_tr <- xgb.DMatrix(data = X_tr, label = y_tr)

ensemble <- xgb.train(
  params  = list(objective = "reg:squarederror", max_depth = 4, eta = 0.3),
  data    = dm_tr,
  nrounds = 100,
  verbose = 0
)

# `data = tr` carries the response column too; the XGBoost adapter
# automatically trims the matrix to the features used at training time.
D <- createDisMatrix(ensemble, data = tr, label = "mpg",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(mpg ~ ., data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 1e-6,
                              n = 2, level = 5))
print(tree)
```

---

## GBM

### Classification (iris, binary)

`gbm` expects a 0/1 numeric response for the `bernoulli` distribution, while
`e2tree` expects a factor response for classification. We therefore train
`gbm` on the integer column and pass a factor copy of the same column to
`e2tree`.

```{r gbm-clf}
# Probe for the optional backend without attaching it; install only if it is
# genuinely missing, then attach once.
if (!requireNamespace("gbm", quietly = TRUE)) {
  install.packages("gbm", repos = "https://cran.r-project.org")
}
library(gbm)

data(iris)
set.seed(42)
df <- iris
# gbm's bernoulli loss needs a 0/1 numeric response; e2tree needs a factor.
df$is_setosa     <- as.integer(df$Species == "setosa")
df$is_setosa_fct <- factor(df$is_setosa, levels = c(0L, 1L))

n  <- floor(0.75 * nrow(df))
tr <- df[sample(nrow(df), n), ]

ensemble <- gbm(
  is_setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = tr,
  distribution = "bernoulli",
  n.trees = 200,
  interaction.depth = 4,
  verbose = FALSE
)

# Predictors used by the model, plus the factor response for annotation
feats <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
D <- createDisMatrix(ensemble,
                     data = tr[, c(feats, "is_setosa_fct")],
                     label = "is_setosa_fct",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(
  is_setosa_fct ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = tr, D = D, ensemble = ensemble,
  setting = list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5)
)
print(tree)
```

### Regression (mtcars)

`gbm` requires `nTrain * bag.fraction > 2 * n.minobsinnode + 1`, which fails
on small training sets such as the 24-row training split of `mtcars` with the
default settings (`n.minobsinnode = 10`, `bag.fraction = 0.5`). Lower
`n.minobsinnode` and raise `bag.fraction` to keep the example self-contained.

```{r gbm-reg}
library(gbm)

data(mtcars)
set.seed(42)
n  <- floor(0.75 * nrow(mtcars))
tr <- mtcars[sample(nrow(mtcars), n), ]

# n.minobsinnode and bag.fraction are relaxed so that gbm accepts the small
# training set.
ensemble <- gbm(
  mpg ~ .,
  data = tr,
  distribution = "gaussian",
  n.trees = 200,
  interaction.depth = 4,
  n.minobsinnode = 2,
  bag.fraction = 0.8,
  verbose = FALSE
)

D <- createDisMatrix(ensemble, data = tr, label = "mpg",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(mpg ~ ., data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 1e-6,
                              n = 2, level = 5))
print(tree)
```

---

## LightGBM

### Classification (iris)

```{r lgb-clf}
# Probe for the optional backend without attaching it; install only if it is
# genuinely missing, then attach once.
if (!requireNamespace("lightgbm", quietly = TRUE)) {
  install.packages("lightgbm", repos = "https://cran.r-project.org")
}
library(lightgbm)

data(iris)
set.seed(42)
n  <- floor(0.75 * nrow(iris))
tr <- iris[sample(nrow(iris), n), ]

# Numeric feature matrix and 0-indexed integer class labels
X_tr <- as.matrix(tr[, 1:4])
y_tr <- as.integer(tr$Species) - 1L
ds   <- lgb.Dataset(X_tr, label = y_tr)

ensemble <- lgb.train(
  params  = list(objective = "multiclass", num_class = 3,
                 num_leaves = 15, verbose = -1),
  data    = ds,
  nrounds = 100
)

# Predictors plus the factor response, so the e2tree() formula can find it
tr_lgb <- tr[, 1:4]
tr_lgb$Species <- tr$Species

D <- createDisMatrix(ensemble, data = tr_lgb, label = "Species",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(Species ~ ., data = tr_lgb, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 0.01,
                              n = 2, level = 5))
print(tree)
```

### Regression (mtcars)

```{r lgb-reg}
library(lightgbm)

data(mtcars)
set.seed(42)
n  <- floor(0.75 * nrow(mtcars))
tr <- mtcars[sample(nrow(mtcars), n), ]

# Drop the response by name rather than by position
X_tr <- as.matrix(tr[, setdiff(names(tr), "mpg")])
y_tr <- tr$mpg
ds   <- lgb.Dataset(X_tr, label = y_tr)

ensemble <- lgb.train(
  params  = list(objective = "regression", num_leaves = 8,
                 min_data_in_leaf = 2, learning_rate = 0.1,
                 verbose = -1),
  data    = ds,
  nrounds = 200
)

# Pass the response column to createDisMatrix() via `label`. The
# LightGBM adapter selects the columns it needs through the booster's
# stored feature names, so any extra columns in `data` are ignored.
D <- createDisMatrix(ensemble, data = tr, label = "mpg",
                     parallel = list(active = FALSE, no_cores = 1))

tree <- e2tree(mpg ~ ., data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 1e-6,
                              n = 2, level = 5))
print(tree)
```

---

## Adding a new backend

To support a further model class `MyEnsemble`, implement three S3 methods and
register them in `NAMESPACE`:

```r
# In R/adapters.R (or a separate R/adapter_mymodel.R)

get_ensemble_type.MyEnsemble <- function(ensemble) {
  # return "classification" or "regression"
}

extract_terminal_nodes.MyEnsemble <- function(ensemble, data) {
  # return data.frame of (n_obs × n_trees) terminal node IDs
}

get_ensemble_predictions.MyEnsemble <- function(ensemble, data, type) {
  # return numeric vector of length n_obs
}
```

No changes to `createDisMatrix()`, `e2tree()`, or any other core function are
required.