---
title: "Autograd Engine"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Autograd Engine}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(eval = TRUE)
```

ggmlR includes a PyTorch-style dynamic autograd engine built on top of ggml
tensors. Gradients are computed via reverse-mode AD on a tape recorded inside
`with_grad_tape({})`.

**Tensor layout:** all autograd tensors are **column-major** — shape is
`[features, batch]`, matching ggml's native C layout.

```{r}
library(ggmlR)
```

---

## 1. Core primitives

### `ag_tensor` and `ag_param`

```{r}
# ag_tensor — non-trainable input (e.g. data)
x <- ag_tensor(matrix(1:6 / 6, nrow = 2L)) # [2, 3]

# ag_param — trainable parameter (gradient accumulated)
W <- ag_param(matrix(rnorm(4), nrow = 2L)) # [2, 2]
b <- ag_param(matrix(0.0, 2L, 1L))
```

### Forward pass and gradient tape

```{r}
with_grad_tape({
  h    <- ag_relu(ag_add(ag_matmul(W, x), b)) # [2, 3]
  loss <- ag_mean(ag_mul(h, h))               # scalar MSE-like
})

grads <- backward(loss) # returns named list of gradients
cat("dL/dW:\n"); print(grads[["W"]])
cat("dL/db:\n"); print(grads[["b"]])
```

`backward()` returns gradients keyed by the parameter's variable name.

---

## 2. Built-in operations

| Function | Description |
|----------|-------------|
| `ag_matmul(A, B)` | Matrix multiply |
| `ag_add(A, B)` | Element-wise add (broadcast supported) |
| `ag_mul(A, B)` | Element-wise multiply |
| `ag_relu(x)` | ReLU activation |
| `ag_sigmoid(x)` | Sigmoid activation |
| `ag_tanh(x)` | Tanh activation |
| `ag_gelu(x)` | GELU activation |
| `ag_softmax(x)` | Softmax (column-wise) |
| `ag_sum(x)` | Sum all elements → scalar |
| `ag_mean(x)` | Mean all elements → scalar |
| `ag_transpose(x)` | Transpose [m,n] → [n,m] |
| `ag_reshape(x, dims)` | Reshape tensor |
| `ag_softmax_cross_entropy_loss(logits, y)` | Fused softmax + cross-entropy |

---

## 3. 
`ag_sequential` — module API

`ag_sequential` stacks layers into a callable module with `.forward()` and
`.parameters()`.

```{r}
data(iris)
set.seed(42)

# Column-major layout: one column per sample.
x_all <- t(scale(as.matrix(iris[, 1:4])))  # [4, 150]
y_oh  <- model.matrix(~ Species - 1, iris)
y_all <- t(y_oh)                           # [3, 150]

idx  <- sample(150L)
x_tr <- x_all[, idx[1:120]]; x_vl <- x_all[, idx[121:150]]
y_tr <- y_all[, idx[1:120]]; y_vl <- y_all[, idx[121:150]]

model <- ag_sequential(
  ag_linear(4L, 64L, activation = "relu"),
  ag_batch_norm(64L),
  ag_dropout(0.3),
  ag_linear(64L, 32L, activation = "relu"),
  ag_linear(32L, 3L)
)

params <- model$parameters()
cat("Parameter tensors:", length(params), "\n")
```

---

## 4. Optimizers

```{r}
# Adam (default lr = 1e-3)
opt <- optimizer_adam(params, lr = 1e-3)

# SGD with momentum
opt_sgd <- optimizer_sgd(params, lr = 0.05, momentum = 0.9)
```

### Training loop

```{r}
BS <- 32L
n  <- ncol(x_tr)
ag_train(model) # set training mode (enables dropout, batch norm train)

set.seed(1)
for (ep in seq_len(150L)) {
  perm <- sample(n)
  for (b in seq_len(ceiling(n / BS))) {
    idx <- perm[seq((b - 1L) * BS + 1L, min(b * BS, n))]
    xb  <- ag_tensor(x_tr[, idx, drop = FALSE])
    yb  <- y_tr[, idx, drop = FALSE]

    with_grad_tape({
      loss <- ag_softmax_cross_entropy_loss(model$forward(xb), yb)
    })
    grads <- backward(loss)
    opt$step(grads)
    opt$zero_grad()
  }
  if (ep %% 50L == 0L) {
    cat(sprintf("epoch %3d loss %.4f\n", ep, loss$data[1]))
  }
}
```

---

## 5. LR schedulers

```{r}
opt2 <- optimizer_adam(params, lr = 1e-3)

# Cosine annealing: lr goes from lr_max to lr_min over T_max epochs
sch_cos <- lr_scheduler_cosine(opt2, T_max = 150L, lr_min = 1e-5)

# Step decay: multiply lr by gamma every step_size epochs
sch_step <- lr_scheduler_step(opt2, step_size = 30L, gamma = 0.5)

# Call after each epoch:
# sch_cos$step()
```

---

## 6. 
Gradient clipping

```{r}
with_grad_tape({
  loss <- ag_softmax_cross_entropy_loss(model$forward(ag_tensor(x_tr)), y_tr)
})
grads <- backward(loss)

# Clip global gradient norm to max_norm
clip_grad_norm(params, grads, max_norm = 5.0)

opt$step(grads)
opt$zero_grad()
```

---

## 7. Dataloader

`ag_dataloader` shuffles and batches column-major data matrices:

```{r}
dl <- ag_dataloader(x_tr, y_tr, batch_size = BS, shuffle = TRUE)

ag_train(model)
for (ep in seq_len(100L)) {
  for (batch in dl$epoch()) {
    with_grad_tape({
      loss <- ag_softmax_cross_entropy_loss(model$forward(batch$x), batch$y$data)
    })
    grads <- backward(loss)
    opt$step(grads); opt$zero_grad()
  }
}
```

---

## 8. Eval mode and inference

```{r}
ag_eval(model) # disables dropout, switches batch norm to inference stats

# Forward in chunks to avoid memory pressure.
# Returns a [classes, n] matrix of column-wise softmax probabilities.
predict_cm <- function(mod, x_cm, chunk = 64L) {
  n   <- ncol(x_cm)
  out <- NULL
  for (s in seq(1L, n, by = chunk)) {
    e  <- min(s + chunk - 1L, n)
    lg <- mod$forward(ag_tensor(x_cm[, s:e, drop = FALSE]))$data
    # Stabilized column-wise softmax. `lg - apply(lg, 2, max)` would recycle
    # the per-column maxima DOWN the columns (wrong elements whenever
    # nrow(lg) != ncol(lg)); sweep() applies them along the correct margin.
    ev <- exp(sweep(lg, 2L, apply(lg, 2L, max)))
    sm <- sweep(ev, 2L, colSums(ev), "/")
    out <- if (is.null(out)) sm else cbind(out, sm)
  }
  out
}

probs <- predict_cm(model, x_vl)     # [3, 30]
preds <- apply(probs, 2, which.max)  # predicted class per sample (length 30)
# y_vl is [3, 30] one-hot: one COLUMN per sample, so take which.max over
# MARGIN 2 (MARGIN 1 would yield a length-3 vector and silently recycle
# in the comparison below).
truth <- apply(y_vl, 2, which.max)
cat(sprintf("Val accuracy: %.4f\n", mean(preds == truth)))
```

---

## 9. 
Raw `ag_param` — full manual control

For complete flexibility, build the network from scratch:

```{r}
set.seed(7)

# He-style initialization, column-major [out, in] weights.
W1 <- ag_param(matrix(rnorm(64 * 4) * sqrt(2 / 4), 64, 4))
b1 <- ag_param(matrix(0.0, 64, 1))
W2 <- ag_param(matrix(rnorm(3 * 64) * sqrt(2 / 64), 3, 64))
b2 <- ag_param(matrix(0.0, 3, 1))

forward <- function(x) {
  ag_add(ag_matmul(W2, ag_relu(ag_add(ag_matmul(W1, x), b1))), b2)
}

opt_raw <- optimizer_adam(list(W1 = W1, b1 = b1, W2 = W2, b2 = b2), lr = 1e-3)

for (ep in seq_len(200L)) {
  perm <- sample(n)
  for (b in seq_len(ceiling(n / BS))) {
    idx <- perm[seq((b - 1L) * BS + 1L, min(b * BS, n))]
    xb  <- ag_tensor(x_tr[, idx, drop = FALSE])
    yb  <- y_tr[, idx, drop = FALSE]
    with_grad_tape({
      loss_r <- ag_softmax_cross_entropy_loss(forward(xb), yb)
    })
    gr <- backward(loss_r)
    opt_raw$step(gr); opt_raw$zero_grad()
  }
}
```

---

## 10. Mixed precision

```{r}
# f16 on GPU, f32 on CPU
device <- tryCatch({ ag_device("gpu"); "gpu" }, error = function(e) "cpu")
ag_dtype(if (device == "gpu") "f16" else "f32")
# All subsequent ag_param / ag_tensor use the selected dtype
```

See also `vignette("gpu-vulkan", package = "ggmlR")` for device management.

---

## 11. Data-parallel training

For multi-GPU or faster single-GPU training see `dp_train()`:

```{r}
# Full example: inst/examples/dp_train_demo.R
```

See also `vignette("data-parallel-training", package = "ggmlR")`.