---
title: "Autograd Engine"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Autograd Engine}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(eval = TRUE)
```

ggmlR includes a PyTorch-style dynamic autograd engine built on top of ggml
tensors. Gradients are computed via reverse-mode AD on a tape recorded inside
`with_grad_tape({})`.

**Tensor layout:** all autograd tensors are **column-major** — shape is
`[features, batch]`, matching ggml's native C layout.

```{r}
library(ggmlR)
```

---

## 1. Core primitives

### `ag_tensor` and `ag_param`

```{r}
# ag_tensor — non-trainable input (e.g. data)
x <- ag_tensor(matrix(1:6 / 6, nrow = 2L)) # [2, 3]

# ag_param — trainable parameter (gradient accumulated)
W <- ag_param(matrix(rnorm(4), nrow = 2L)) # [2, 2]
b <- ag_param(matrix(0.0, 2L, 1L))
```

### Forward pass and gradient tape

```{r}
with_grad_tape({
  h    <- ag_relu(ag_add(ag_matmul(W, x), b)) # [2, 3]
  loss <- ag_mean(ag_mul(h, h))               # scalar MSE-like
})

grads <- backward(loss) # returns named list of gradients
cat("dL/dW:\n"); print(grads[["W"]])
cat("dL/db:\n"); print(grads[["b"]])
```

`backward()` returns gradients keyed by the parameter's variable name.

---

## 2. Built-in operations

| Function | Description |
|----------|-------------|
| `ag_matmul(A, B)` | Matrix multiply |
| `ag_add(A, B)` | Element-wise add (broadcast supported) |
| `ag_mul(A, B)` | Element-wise multiply |
| `ag_relu(x)` | ReLU activation |
| `ag_sigmoid(x)` | Sigmoid activation |
| `ag_tanh(x)` | Tanh activation |
| `ag_gelu(x)` | GELU activation |
| `ag_softmax(x)` | Softmax (column-wise) |
| `ag_sum(x)` | Sum all elements → scalar |
| `ag_mean(x)` | Mean all elements → scalar |
| `ag_transpose(x)` | Transpose [m,n] → [n,m] |
| `ag_reshape(x, dims)` | Reshape tensor |
| `ag_softmax_cross_entropy_loss(logits, y)` | Fused softmax + cross-entropy |

---

## 3. 
`ag_sequential` — module API

`ag_sequential` stacks layers into a callable module with `.forward()` and
`.parameters()`.

```{r}
data(iris)
set.seed(42)

# Column-major layout: one column per sample.
x_all <- t(scale(as.matrix(iris[, 1:4])))  # [4, 150]
y_oh  <- model.matrix(~ Species - 1, iris)
y_all <- t(y_oh)                           # [3, 150]

idx  <- sample(150L)
x_tr <- x_all[, idx[1:120]]; x_vl <- x_all[, idx[121:150]]
y_tr <- y_all[, idx[1:120]]; y_vl <- y_all[, idx[121:150]]

model <- ag_sequential(
  ag_linear(4L, 64L, activation = "relu"),
  ag_batch_norm(64L),
  ag_dropout(0.3),
  ag_linear(64L, 32L, activation = "relu"),
  ag_linear(32L, 3L)
)

params <- model$parameters()
cat("Parameter tensors:", length(params), "\n")
```

---

## 4. Optimizers

```{r}
# Adam (default lr = 1e-3)
opt <- optimizer_adam(params, lr = 1e-3)

# SGD with momentum
opt_sgd <- optimizer_sgd(params, lr = 0.05, momentum = 0.9)
```

### Training loop

```{r}
BS <- 32L
n  <- ncol(x_tr)
ag_train(model) # set training mode (enables dropout, batch norm train)

set.seed(1)
for (ep in seq_len(150L)) {
  perm <- sample(n)
  for (b in seq_len(ceiling(n / BS))) {
    idx <- perm[seq((b - 1L) * BS + 1L, min(b * BS, n))]
    xb  <- ag_tensor(x_tr[, idx, drop = FALSE])
    yb  <- y_tr[, idx, drop = FALSE]

    with_grad_tape({
      loss <- ag_softmax_cross_entropy_loss(model$forward(xb), yb)
    })
    grads <- backward(loss)
    opt$step(grads)
    opt$zero_grad()
  }
  if (ep %% 50L == 0L) {
    cat(sprintf("epoch %3d loss %.4f\n", ep, loss$data[1]))
  }
}
```

---

## 5. LR schedulers

```{r}
opt2 <- optimizer_adam(params, lr = 1e-3)

# Cosine annealing: lr goes from lr_max to lr_min over T_max epochs
sch_cos <- lr_scheduler_cosine(opt2, T_max = 150L, lr_min = 1e-5)

# Step decay: multiply lr by gamma every step_size epochs
sch_step <- lr_scheduler_step(opt2, step_size = 30L, gamma = 0.5)

# Call after each epoch:
# sch_cos$step()
```

---

## 6. 
Gradient clipping

```{r}
with_grad_tape({
  loss <- ag_softmax_cross_entropy_loss(model$forward(ag_tensor(x_tr)), y_tr)
})
grads <- backward(loss)

# Clip global gradient norm to max_norm
clip_grad_norm(params, grads, max_norm = 5.0)

opt$step(grads)
opt$zero_grad()
```

---

## 7. Dataloader

`ag_dataloader` shuffles and batches column-major data matrices:

```{r}
dl <- ag_dataloader(x_tr, y_tr, batch_size = BS, shuffle = TRUE)

ag_train(model)
for (ep in seq_len(100L)) {
  for (batch in dl$epoch()) {
    with_grad_tape({
      loss <- ag_softmax_cross_entropy_loss(model$forward(batch$x), batch$y$data)
    })
    grads <- backward(loss)
    opt$step(grads); opt$zero_grad()
  }
}
```

---

## 8. Eval mode and inference

```{r}
ag_eval(model) # disables dropout, switches batch norm to inference stats

# Forward in chunks to avoid memory pressure.
# Returns a [classes, n] matrix of column-wise softmax probabilities.
predict_cm <- function(mod, x_cm, chunk = 64L) {
  n   <- ncol(x_cm)
  out <- NULL
  for (s in seq(1L, n, by = chunk)) {
    e  <- min(s + chunk - 1L, n)
    lg <- mod$forward(ag_tensor(x_cm[, s:e, drop = FALSE]))$data
    # Stabilized column-wise softmax. `lg - apply(lg, 2, max)` would recycle
    # the per-column maxima DOWN the columns (wrong elements whenever
    # nrow(lg) != ncol(lg)); sweep() applies them along the correct margin.
    ev <- exp(sweep(lg, 2L, apply(lg, 2L, max)))
    sm <- sweep(ev, 2L, colSums(ev), "/")
    out <- if (is.null(out)) sm else cbind(out, sm)
  }
  out
}

probs <- predict_cm(model, x_vl)     # [3, 30]
preds <- apply(probs, 2, which.max)  # predicted class per sample (length 30)
# y_vl is [3, 30] one-hot: one COLUMN per sample, so take which.max over
# MARGIN 2 (MARGIN 1 would yield a length-3 vector and silently recycle
# in the comparison below).
truth <- apply(y_vl, 2, which.max)
cat(sprintf("Val accuracy: %.4f\n", mean(preds == truth)))
```

---

## 9. 
Raw `ag_param` — full manual control

For complete flexibility, build the network from scratch:

```{r}
set.seed(7)

# He-style initialization, column-major [out, in] weights.
W1 <- ag_param(matrix(rnorm(64 * 4) * sqrt(2 / 4), 64, 4))
b1 <- ag_param(matrix(0.0, 64, 1))
W2 <- ag_param(matrix(rnorm(3 * 64) * sqrt(2 / 64), 3, 64))
b2 <- ag_param(matrix(0.0, 3, 1))

forward <- function(x) {
  ag_add(ag_matmul(W2, ag_relu(ag_add(ag_matmul(W1, x), b1))), b2)
}

opt_raw <- optimizer_adam(list(W1 = W1, b1 = b1, W2 = W2, b2 = b2), lr = 1e-3)

for (ep in seq_len(200L)) {
  perm <- sample(n)
  for (b in seq_len(ceiling(n / BS))) {
    idx <- perm[seq((b - 1L) * BS + 1L, min(b * BS, n))]
    xb  <- ag_tensor(x_tr[, idx, drop = FALSE])
    yb  <- y_tr[, idx, drop = FALSE]
    with_grad_tape({
      loss_r <- ag_softmax_cross_entropy_loss(forward(xb), yb)
    })
    gr <- backward(loss_r)
    opt_raw$step(gr); opt_raw$zero_grad()
  }
}
```

---

## 10. Mixed precision

```{r}
# f16 on GPU, f32 on CPU
device <- tryCatch({ ag_device("gpu"); "gpu" }, error = function(e) "cpu")
ag_dtype(if (device == "gpu") "f16" else "f32")
# All subsequent ag_param / ag_tensor use the selected dtype
```

See also `vignette("gpu-vulkan", package = "ggmlR")` for device management.

---

## 11. Data-parallel training

For multi-GPU or faster single-GPU training see `dp_train()`:

```{r}
# Full example: inst/examples/dp_train_demo.R
```

See also `vignette("data-parallel-training", package = "ggmlR")`.