---
title: "Advanced Real Data Examples: Zero-Inflation, Semicontinuous, and Longitudinal Growth"
shorttitle: "Advanced Real Data Examples"
author: 
- name: "Frédéric Bertrand"
  affiliation: 
  - Cedric, Cnam, Paris
  email: frederic.bertrand@lecnam.net
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Advanced Real Data Examples: Zero-Inflation, Semicontinuous, and Longitudinal Growth}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
#file.edit(normalizePath("~/.Renviron"))
LOCAL <- identical(Sys.getenv("LOCAL"), "TRUE")
#LOCAL=TRUE
knitr::opts_chunk$set(purl = LOCAL)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

This vignette demonstrates **SelectBoost.gamlss** on three additional *real* datasets:
- **Zero-inflated counts** (`pscl::bioChemists`): ZIP / ZINB
- **Semicontinuous** (`datasets::airquality` Ozone): ZAGA
- **Longitudinal growth with random effects** (`nlme::Orthodont`): random intercepts

## What you'll learn

* How to fit `sb_gamlss()` with grouped penalties on real zero-inflated data sets.
* How to adapt SelectBoost scopes to semicontinuous responses (ZAGA) and longitudinal models with random effects.
* How to interpret `selection_table()` outputs when categorical terms are represented by grouped dummy variables.

> **Conference highlight.** These examples build on the communication *"An Improvement for Variable Selection for Generalized Additive Models for Location, Shape and Scale and Quantile Regression"* (Frédéric Bertrand & Myriam Maumy) delivered at the Joint Statistics Meetings 2024 in Portland, emphasizing SelectBoost-driven variable selection for GAMLSS and quantile regression through resampling-intensive benchmarking.

```{r}
library(SelectBoost.gamlss)
library(gamlss)
```

> Optional engines: `install.packages(c("glmnet","grpreg","SGL","future.apply"))`

## 1. Zero-inflated counts — `pscl::bioChemists` (ZIP/ZINB)

```{r, cache=TRUE, eval=LOCAL}
bc <- NULL
if (requireNamespace("pscl", quietly = TRUE)) {
  data("bioChemists", package = "pscl")
  bc <- pscl::bioChemists
  needed <- c("art", "kid5", "phd", "fem", "mar", "ment")
  missing_cols <- setdiff(needed, names(bc))
  if (length(missing_cols)) {
    cat(
      "bioChemists dataset is missing columns:",
      paste(missing_cols, collapse = ", "),
      "— skipping ZIP example.\n"
    )
    bc <- NULL
  } else {
    keep <- stats::complete.cases(bc[, needed])
    bc <- bc[keep, , drop = FALSE]
    bc$fem <- factor(bc$fem)
    bc$mar <- factor(bc$mar)
    bc$kid5 <- factor(bc$kid5)

    # ZIP engine via stepwise (mu) + grouped/elastic features
    set.seed(10)
    fit_zip <- sb_gamlss(
      art ~ 1, data = bc, family = gamlss.dist::ZIP(),
      mu_scope = ~ kid5 + fem + mar + phd + ment,
      engine = "grpreg", grpreg_penalty = "grLasso",
      B = 80, pi_thr = 0.6, pre_standardize = FALSE, parallel = "auto", trace = FALSE
    )
    sel_zip <- selection_table(fit_zip)
    sel_zip <- sel_zip[order(sel_zip$parameter, -sel_zip$prop), , drop = FALSE]
    head(sel_zip, n = min(8L, nrow(sel_zip)))
    stable_zip <- sel_zip[sel_zip$prop >= fit_zip$pi_thr, , drop = FALSE]
    if (nrow(stable_zip)) {
      stable_zip
    } else {
      cat("No terms reached the stability threshold of", fit_zip$pi_thr, "for this run.\n")
    }
  }
} else {
  cat("Package 'pscl' not installed; skipping bioChemists example.\n")
}
```

The tables above report the eight most stable terms (sorted within each parameter) together with the subset that exceeds the stability threshold `pi_thr`.

### ZINB often handles overdispersion better than ZIP
```{r, cache=TRUE, eval=LOCAL}
if (requireNamespace("pscl", quietly = TRUE) && !is.null(bc)) {
  set.seed(11)
  fit_zinb <- sb_gamlss(
    art ~ 1, data = bc, family = gamlss.dist::ZINBI(),
    mu_scope = ~ kid5 + fem + mar + phd + ment,
    engine = "grpreg", grpreg_penalty = "grLasso",
    B = 80, pi_thr = 0.6, pre_standardize = FALSE, parallel = "auto", trace = FALSE
  )
  sel_zinb <- selection_table(fit_zinb)
  sel_zinb <- sel_zinb[order(sel_zinb$parameter, -sel_zinb$prop), , drop = FALSE]
  head(sel_zinb, n = min(8L, nrow(sel_zinb)))
  stable_zinb <- sel_zinb[sel_zinb$prop >= fit_zinb$pi_thr, , drop = FALSE]
  if (nrow(stable_zinb)) {
    stable_zinb
  } else {
    cat("No terms reached the stability threshold of", fit_zinb$pi_thr, "for this run.\n")
  }
} else if (!requireNamespace("pscl", quietly = TRUE)) {
  cat("Package 'pscl' not installed; skipping bioChemists example.\n")
} else {
  cat("bioChemists dataset unavailable; skipping ZINBI example.\n")
}
```

### Side-by-side metrics (ZIP vs ZINB)
```{r, cache=TRUE, eval=LOCAL}
if (requireNamespace("pscl", quietly = TRUE)) {
  if (exists("fit_zip", inherits = FALSE) && exists("fit_zinb", inherits = FALSE)) {
    zip_g <- fit_zip$final_fit
    zinb_g <- fit_zinb$final_fit
    if (inherits(zip_g, "gamlss") && inherits(zinb_g, "gamlss")) {
      safe_scalar <- function(expr) {
        out <- tryCatch(expr, error = function(e) NA_real_)
        if (length(out) == 0L) NA_real_ else as.numeric(out[1L])
      }
      met <- data.frame(
        model = c("ZIP", "ZINBI"),
        AIC = c(safe_scalar(AIC(zip_g)), safe_scalar(AIC(zinb_g))),
        GAIC_k2 = c(safe_scalar(gamlss::GAIC(zip_g, k = 2)), safe_scalar(gamlss::GAIC(zinb_g, k = 2))),
        deviance = c(safe_scalar(zip_g$deviance), safe_scalar(zinb_g$deviance))
      )
      if (any(!is.na(met$AIC)) || any(!is.na(met$GAIC_k2)) || any(!is.na(met$deviance))) {
        print(met)
      } else {
        cat("Model fit statistics unavailable; skipping metric table.\n")
      }
    } else {
      cat("Final fits not found; metrics skipped.\n")
    }
  } else {
    cat("Models not available; run the fitting chunks first.\n")
  }
} else {
  cat("Package 'pscl' not installed; skipping bioChemists example.\n")
}
```

**Notes.** Many `art` values are zero; `ZINB` usually outperforms `ZIP` when overdispersion is present. Group lasso treats factor levels as single grouped terms.

---

## 2. Semicontinuous (ZAGA) — `airquality::Ozone`

`Ozone` is non-negative and has zeros; positive part is continuous — ideal for **ZAGA** (zero-adjusted Gamma).

```{r, cache=TRUE, eval=LOCAL}
aq <- subset(airquality, !is.na(Ozone) & !is.na(Wind) & !is.na(Temp) & !is.na(Solar.R))
aq$Month <- factor(aq$Month)

set.seed(20)
fit_zaga <- sb_gamlss(
  Ozone ~ 1, data = aq, family = gamlss.dist::ZAGA(),
  mu_scope    = ~ pb(Temp) + pb(Wind) + pb(Solar.R) + Month,
  sigma_scope = ~ pb(Temp),
  engine = "grpreg", grpreg_penalty = "grLasso",
  B = 80, pi_thr = 0.6, pre_standardize = TRUE, parallel = "auto", trace = FALSE
)
sel_zaga <- selection_table(fit_zaga)
sel_zaga <- sel_zaga[order(sel_zaga$parameter, -sel_zaga$prop), , drop = FALSE]
head(sel_zaga, n = min(8L, nrow(sel_zaga)))
stable_zaga <- sel_zaga[sel_zaga$prop >= fit_zaga$pi_thr, , drop = FALSE]
if (nrow(stable_zaga)) {
  stable_zaga
} else {
  cat("No terms reached the stability threshold of", fit_zaga$pi_thr, "for this run.\n")
}
```

As with the count model, we present the eight most persistent effects plus the subset clearing the stability requirement.

### Effect plot for Temp on mu (ZAGA)
```{r, cache=TRUE, eval=LOCAL}
if (requireNamespace('ggplot2', quietly = TRUE)) {
  print(effect_plot(fit_zaga, 'Temp', aq, what = 'mu'))
}
```

**Notes.** `ZAGA` models a point mass at zero plus a Gamma for the positive part. Group selection keeps whole spline bases together.

---

## 3. Longitudinal growth with random effects — `nlme::Orthodont`

Fit **random intercepts** by subject using `gamlss::random()`, while selecting fixed effects for μ.

```{r, cache=TRUE, eval=LOCAL}
if (requireNamespace("nlme", quietly = TRUE)) {
  data_ok <- tryCatch({
    utils::data("Orthodont", package = "nlme", envir = environment())
    TRUE
  }, error = function(e) FALSE)
  if (!data_ok || !exists("Orthodont", inherits = FALSE)) {
    cat("Dataset 'Orthodont' unavailable; skipping longitudinal example.\n")
  } else {
    od <- get("Orthodont", inherits = FALSE)
    rm(Orthodont)
    od$Subject <- factor(od$Subject)
    od$Sex <- factor(od$Sex)
    set.seed(30)
    fit_long <- sb_gamlss(
      distance ~ gamlss::random(Subject),     # random intercept
      data = od, family = gamlss.dist::NO(),
      mu_scope = ~ pb(age) + Sex + pb(age):Sex,      # select fixed effects
      engine = "grpreg", grpreg_penalty = "grLasso",
      B = 80, pi_thr = 0.6, pre_standardize = TRUE, parallel = "auto", trace = FALSE
    )
  sel_long <- selection_table(fit_long)
  sel_long <- sel_long[order(sel_long$parameter, -sel_long$prop), , drop = FALSE]
  head(sel_long, n = min(8L, nrow(sel_long)))
  stable_long <- sel_long[sel_long$prop >= fit_long$pi_thr, , drop = FALSE]
  if (nrow(stable_long)) {
    stable_long
  } else {
    cat("No terms reached the stability threshold of", fit_long$pi_thr, "for this run.\n")
  }
    if (requireNamespace("ggplot2", quietly = TRUE)) {
      gg <- tryCatch(
        effect_plot(fit_long, "age", od, what = "mu"),
        error = function(e) {
          message("effect_plot() skipped for random-effect example: ", conditionMessage(e))
          NULL
        }
      )
      if (!is.null(gg)) {
        print(gg)
      }
    }
  }
} else {
  cat("Package 'nlme' not installed; skipping longitudinal example.\n")
}
```

**Notes.** The `random(Subject)` term stays in the base formula (not in `mu_scope`) so it is always included, while SelectBoost focuses on selecting the **fixed** structure. Some plotting helpers (e.g., `effect_plot()`) may be skipped when random-effect predictions are unavailable in a given setup.