## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(eval = TRUE)

## -----------------------------------------------------------------------------
# install.packages("ggmlR")  # once on CRAN
library(ggmlR)

## -----------------------------------------------------------------------------
# Iris classification demo: standardise features, one-hot encode the target,
# and hold out 30 samples for validation.
data(iris)
set.seed(42)
x <- as.matrix(iris[, 1:4])
x <- scale(x)  # standardise

# one-hot encode species
y <- model.matrix(~ Species - 1, iris)  # [150 x 3]

idx <- sample(nrow(x))
n_tr <- 120L
x_tr  <- x[idx[1:n_tr], ]
y_tr  <- y[idx[1:n_tr], ]
x_val <- x[idx[(n_tr + 1):150], ]
y_val <- y[idx[(n_tr + 1):150], ]

## -----------------------------------------------------------------------------
# Sequential (Keras-style) API: dense -> batch norm -> dropout -> dense -> softmax.
model <- ggml_model_sequential() |>
  ggml_layer_dense(32L, activation = "relu", input_shape = 4L) |>
  ggml_layer_batch_norm() |>
  ggml_layer_dropout(0.3, stochastic = TRUE) |>
  ggml_layer_dense(16L, activation = "relu") |>
  ggml_layer_dense(3L, activation = "softmax") |>
  ggml_compile(
    optimizer = "adam",
    loss = "categorical_crossentropy",
    metrics = c("accuracy")
  )
print(model)

## -----------------------------------------------------------------------------
model <- ggml_fit(
  model, x_tr, y_tr,
  epochs = 100L,
  batch_size = 32L,
  validation_split = 0.0,  # we supply our own val set below
  verbose = 0L
)

## -----------------------------------------------------------------------------
# Evaluate on the held-out set and show a confusion matrix.
score <- ggml_evaluate(model, x_val, y_val, batch_size = 16L)
cat(sprintf("Val loss: %.4f  Val accuracy: %.4f\n", score$loss, score$accuracy))

probs <- ggml_predict(model, x_val, batch_size = 16L)
# probs is [samples x classes]: the predicted class is the max within each ROW.
classes <- apply(probs, 1, which.max)
true <- apply(y_val, 1, which.max)
cat("Confusion matrix:\n")
print(table(true = true, predicted = classes))

## -----------------------------------------------------------------------------
# Round-trip the fitted model through disk and confirm accuracy is unchanged.
path <- tempfile(fileext = ".rds")
ggml_save_model(model, path)
cat(sprintf("Saved: %s (%.1f KB)\n", path, file.size(path) / 1024))

model2 <- ggml_load_model(path)
score2 <- ggml_evaluate(model2, x_val, y_val, batch_size = 16L)
cat(sprintf("Reloaded model accuracy: %.4f\n", score2$accuracy))

## -----------------------------------------------------------------------------
# Functional API: the same architecture built from explicit input/output nodes.
inp <- ggml_input(shape = 4L)
h <- inp |>
  ggml_layer_dense(32L, activation = "relu") |>
  ggml_layer_batch_norm()
h <- h |>
  ggml_layer_dense(16L, activation = "relu")
out <- h |>
  ggml_layer_dense(3L, activation = "softmax")

model_fn <- ggml_model(inputs = inp, outputs = out) |>
  ggml_compile(
    optimizer = "adam",
    loss = "categorical_crossentropy",
    metrics = c("accuracy")
  )

model_fn <- ggml_fit(model_fn, x_tr, y_tr,
                     epochs = 100L, batch_size = 32L, verbose = 0L)
score_fn <- ggml_evaluate(model_fn, x_val, y_val, batch_size = 16L)
cat(sprintf("Functional model accuracy: %.4f\n", score_fn$accuracy))

## -----------------------------------------------------------------------------
# Multi-input regression on mtcars: two feature groups merged by element-wise add.
data(mtcars)
set.seed(7)

# Input group 1: engine (disp, hp, wt)
# Input group 2: transmission / gearbox (cyl, gear, carb, am)
engine <- as.matrix(scale(mtcars[, c("disp", "hp", "wt")]))
trans  <- as.matrix(scale(mtcars[, c("cyl", "gear", "carb", "am")]))
y_mpg  <- matrix(scale(mtcars$mpg), ncol = 1L)  # [32 x 1]

# small dataset -- use all for training, evaluate on same data for demo
x1 <- engine
x2 <- trans

inp1 <- ggml_input(shape = 3L, name = "engine")
inp2 <- ggml_input(shape = 4L, name = "transmission")

branch1 <- inp1 |> ggml_layer_dense(16L, activation = "relu")
branch2 <- inp2 |> ggml_layer_dense(16L, activation = "relu")
merged <- ggml_layer_add(list(branch1, branch2))  # element-wise add

out_reg <- merged |>
  ggml_layer_dense(8L, activation = "relu") |>
  ggml_layer_dense(1L)

model_reg <- ggml_model(inputs = list(inp1, inp2), outputs = out_reg) |>
  ggml_compile(optimizer = "adam", loss = "mse")

model_reg <- ggml_fit(model_reg, x = list(x1, x2), y = y_mpg,
                      epochs = 200L, batch_size = 16L, verbose = 0L)

preds <- ggml_predict(model_reg, x = list(x1, x2), batch_size = 16L)
cat(sprintf("Pearson r (scaled mpg): %.4f\n", cor(preds, y_mpg)))

## -----------------------------------------------------------------------------
# Early stopping on validation loss.
cb_stop <- ggml_callback_early_stopping(
  monitor = "val_loss",
  patience = 15L,
  min_delta = 1e-4
)

model_cb <- ggml_model_sequential() |>
  ggml_layer_dense(32L, activation = "relu", input_shape = 4L) |>
  ggml_layer_dense(3L, activation = "softmax") |>
  ggml_compile(optimizer = "adam", loss = "categorical_crossentropy")

model_cb <- ggml_fit(model_cb, x_tr, y_tr,
                     epochs = 300L, batch_size = 32L,
                     validation_split = 0.1,
                     callbacks = list(cb_stop),
                     verbose = 0L)

## -----------------------------------------------------------------------------
# Learning-rate schedules (only the cosine schedule is used in the fit below;
# the others are shown for reference).
# Cosine annealing
cb_cosine <- ggml_schedule_cosine_decay(T_max = 100L, eta_min = 1e-5)

# Step decay: halve LR every 30 epochs
cb_step <- ggml_schedule_step_decay(step_size = 30L, gamma = 0.5)

# Reduce on plateau
cb_plateau <- ggml_schedule_reduce_on_plateau(
  monitor = "val_loss",
  factor = 0.5,
  patience = 10L,
  min_lr = 1e-6
)

model_lr <- ggml_model_sequential() |>
  ggml_layer_dense(32L, activation = "relu", input_shape = 4L) |>
  ggml_layer_dense(3L, activation = "softmax") |>
  ggml_compile(optimizer = "adam", loss = "categorical_crossentropy")

model_lr <- ggml_fit(model_lr, x_tr, y_tr,
                     epochs = 150L, batch_size = 32L,
                     validation_split = 0.1,
                     callbacks = list(cb_cosine),
                     verbose = 0L)

## -----------------------------------------------------------------------------
# Autograd API uses a column-major layout:
# transpose: rows = features, cols = samples
x_tr_ag  <- t(x_tr)   # [4, 120]
y_tr_ag  <- t(y_tr)   # [3, 120]
x_val_ag <- t(x_val)  # [4, 30]
y_val_ag <- t(y_val)  # [3, 30]

## -----------------------------------------------------------------------------
ag_mod <- ag_sequential(
  ag_linear(4L, 32L, activation = "relu"),
  ag_batch_norm(32L),
  ag_dropout(0.3),
  ag_linear(32L, 16L, activation = "relu"),
  ag_linear(16L, 3L)
)
params <- ag_mod$parameters()
opt <- optimizer_adam(params, lr = 1e-3)

## -----------------------------------------------------------------------------
# Manual mini-batch training loop with gradient tape.
BS <- 32L
n <- ncol(x_tr_ag)
ag_train(ag_mod)
set.seed(42)
for (ep in seq_len(150L)) {
  perm <- sample(n)
  for (b in seq_len(ceiling(n / BS))) {
    idx <- perm[seq((b - 1L) * BS + 1L, min(b * BS, n))]
    xb <- ag_tensor(x_tr_ag[, idx, drop = FALSE])
    yb <- y_tr_ag[, idx, drop = FALSE]
    with_grad_tape({
      loss <- ag_softmax_cross_entropy_loss(ag_mod$forward(xb), yb)
    })
    grads <- backward(loss)
    opt$step(grads)
    opt$zero_grad()
  }
  if (ep %% 50L == 0L) cat(sprintf("epoch %d loss %.4f\n", ep, loss$data[1]))
}

## -----------------------------------------------------------------------------
ag_eval(ag_mod)

# Forward in chunks and apply a numerically stable softmax manually.
# x_cm is column-major ([features x samples]); the returned matrix is
# [classes x samples].
ag_predict_cm <- function(model, x_cm, chunk = 64L) {
  n <- ncol(x_cm)
  # One probe forward pass to learn the output dimension.
  n_out <- nrow(model$forward(ag_tensor(x_cm[, 1, drop = FALSE]))$data)
  out <- matrix(0.0, n_out, n)
  for (s in seq(1L, n, by = chunk)) {
    e <- min(s + chunk - 1L, n)
    lg <- model$forward(ag_tensor(x_cm[, s:e, drop = FALSE]))$data
    # FIX: use sweep() for per-column operations. The previous
    # `lg - apply(lg, 2, max)` and `ev / colSums(ev)` recycled the
    # length-ncol vector DOWN the rows (column-major order), subtracting
    # and normalising against the wrong columns.
    ev <- exp(sweep(lg, 2L, apply(lg, 2L, max)))
    out[, s:e] <- sweep(ev, 2L, colSums(ev), "/")
  }
  out
}

probs_ag <- ag_predict_cm(ag_mod, x_val_ag)  # [3, 30]
pred_ag <- apply(probs_ag, 2, which.max)
# FIX: y_val_ag is [classes x samples], so the true class of each sample is
# the argmax within each COLUMN (MARGIN = 2). The previous MARGIN = 1 gave a
# length-3 vector that was silently recycled in the accuracy comparison.
true_ag <- apply(y_val_ag, 2, which.max)
acc_ag <- mean(pred_ag == true_ag)
cat(sprintf("Autograd val accuracy: %.4f\n", acc_ag))

## -----------------------------------------------------------------------------
# Same loop, larger net, plus cosine LR schedule and gradient clipping.
ag_mod2 <- ag_sequential(
  ag_linear(4L, 64L, activation = "relu"),
  ag_batch_norm(64L),
  ag_dropout(0.3),
  ag_linear(64L, 32L, activation = "relu"),
  ag_linear(32L, 3L)
)
params2 <- ag_mod2$parameters()
opt2 <- optimizer_adam(params2, lr = 1e-3)
sch2 <- lr_scheduler_cosine(opt2, T_max = 150L, lr_min = 1e-5)

ag_train(ag_mod2)
set.seed(42)
for (ep in seq_len(150L)) {
  perm <- sample(n)
  for (b in seq_len(ceiling(n / BS))) {
    idx <- perm[seq((b - 1L) * BS + 1L, min(b * BS, n))]
    xb <- ag_tensor(x_tr_ag[, idx, drop = FALSE])
    yb <- y_tr_ag[, idx, drop = FALSE]
    with_grad_tape({
      loss2 <- ag_softmax_cross_entropy_loss(ag_mod2$forward(xb), yb)
    })
    grads2 <- backward(loss2)
    clip_grad_norm(params2, grads2, max_norm = 5.0)
    opt2$step(grads2)
    opt2$zero_grad()
  }
  sch2$step()
}

ag_eval(ag_mod2)
probs2 <- ag_predict_cm(ag_mod2, x_val_ag)
acc2 <- mean(apply(probs2, 2, which.max) == true_ag)
cat(sprintf("ag + cosine + clip val accuracy: %.4f\n", acc2))

## -----------------------------------------------------------------------------
# Raw ag_param API: a two-layer MLP with He-initialised weights, closing over
# its parameters.
make_net <- function(n_in, n_hidden, n_out) {
  W1 <- ag_param(matrix(rnorm(n_hidden * n_in) * sqrt(2 / n_in), n_hidden, n_in))
  b1 <- ag_param(matrix(0.0, n_hidden, 1L))
  W2 <- ag_param(matrix(rnorm(n_out * n_hidden) * sqrt(2 / n_hidden), n_out, n_hidden))
  b2 <- ag_param(matrix(0.0, n_out, 1L))
  list(
    # forward: W2 %*% relu(W1 %*% x + b1) + b2
    forward = function(x) {
      ag_add(ag_matmul(W2, ag_relu(ag_add(ag_matmul(W1, x), b1))), b2)
    },
    parameters = function() list(W1 = W1, b1 = b1, W2 = W2, b2 = b2)
  )
}

set.seed(1)
net <- make_net(4L, 32L, 3L)
opt_r <- optimizer_adam(net$parameters(), lr = 1e-3)

for (ep in seq_len(200L)) {
  perm <- sample(n)
  for (b in seq_len(ceiling(n / BS))) {
    idx <- perm[seq((b - 1L) * BS + 1L, min(b * BS, n))]
    xb <- ag_tensor(x_tr_ag[, idx, drop = FALSE])
    yb <- y_tr_ag[, idx, drop = FALSE]
    with_grad_tape({
      loss_r <- ag_softmax_cross_entropy_loss(net$forward(xb), yb)
    })
    gr <- backward(loss_r)
    opt_r$step(gr)
    opt_r$zero_grad()
  }
}

probs_r <- ag_predict_cm(net, x_val_ag)
acc_r <- mean(apply(probs_r, 2, which.max) == true_ag)
cat(sprintf("Raw ag_param val accuracy: %.4f\n", acc_r))

## -----------------------------------------------------------------------------
# Use Vulkan GPU if available, fall back to CPU
device <- tryCatch(
  {
    ag_device("gpu")
    "gpu"
  },
  error = function(e) "cpu"
)
cat("Running on:", device, "\n")

# Mixed precision (f16 on GPU, f32 on CPU)
ag_dtype(if (device == "gpu") "f16" else "f32")