---
title: "Compilation and Call Overhead"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Compilation and Call Overhead}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
knitr::knit_engines$set(rtinycc = Rtinycc:::rtinycc_engine)
tcc_bind <- Rtinycc::tcc_bind
tcc_compile <- Rtinycc::tcc_compile
tcc_ffi <- Rtinycc::tcc_ffi
tcc_source <- Rtinycc::tcc_source
has_callme <- requireNamespace("callme", quietly = TRUE)
has_bench <- requireNamespace("bench", quietly = TRUE)
```

```{css, echo=FALSE}
.rtinycc {
  background-color: #E3F2FD;
}
pre.rtinycc span {
  background-color: #E3F2FD;
}
```

This article measures two different costs:

- compilation latency for a tiny module
- call overhead once the code is already compiled

The comparison target is the [`callme`](https://cran.r-project.org/package=callme) package, which builds ordinary `.Call()` entry points with `R CMD SHLIB`. That means it goes through the platform compiler toolchain (`gcc`/`clang` on the usual Unix-like targets), so we should expect stronger optimization than TinyCC for steady-state machine code.

That does not make the comparison useless, but it does mean the runtime results combine two effects:

- direct `.Call()` entry points and direct R C API allocation in `callme`
- better backend optimization from the system compiler

The point is not that the two packages expose identical APIs. They do not. Instead, the comparison asks three narrower questions:

- how much compile-time latency does in-memory TinyCC avoid?
- what is the extra runtime cost of Rtinycc's generated wrapper layer?
- how much does an extra copy matter when Rtinycc has to convert a returned C buffer into an R vector?
## Three Minimal Cases

We use three small workloads:

- `noop()`: takes nothing, returns nothing
- `fill_rand(out, n)`: fills a caller-provided numeric buffer in place
- `rand_unif(n)`: generates `n` random doubles

The `fill_rand()` case is the fairer array-oriented comparison:

- `Rtinycc` receives a `numeric_array`, so the wrapper borrows the backing `REAL()` storage of the R vector directly
- `callme` takes an R numeric vector and writes into `REAL(vec)` directly

The `rand_unif()` case intentionally stresses the extra copy path:

- `callme` allocates the final R vector directly with the R C API
- `Rtinycc` returns a heap-allocated `double*`, and the generated wrapper copies that buffer into a fresh R numeric vector before freeing the original C allocation

```{rtinycc, object = "rtinycc_code"}
#include <stdlib.h>
#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>

void noop(void) {}

void fill_rand(double* out, int n) {
  if (n < 0) {
    Rf_error("n must be non-negative");
  }
  GetRNGstate();
  for (int i = 0; i < n; ++i) {
    out[i] = unif_rand();
  }
  PutRNGstate();
}

double* rand_unif(int n) {
  if (n < 0) {
    Rf_error("n must be non-negative");
  }
  if (n == 0) {
    /* malloc(0) may legally return NULL; hand the wrapper a real pointer. */
    return (double*) malloc(sizeof(double));
  }
  double *out = (double*) malloc(sizeof(double) * (size_t) n);
  if (!out) {
    Rf_error("malloc failed");
  }
  GetRNGstate();
  for (int i = 0; i < n; ++i) {
    out[i] = unif_rand();
  }
  PutRNGstate();
  return out;
}
```

```{rtinycc, object = "callme_code"}
#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>

SEXP noop(void) {
  return R_NilValue;
}

SEXP fill_rand(SEXP out_, SEXP n_) {
  int n = asInteger(n_);
  if (n < 0) {
    Rf_error("n must be non-negative");
  }
  if (TYPEOF(out_) != REALSXP) {
    Rf_error("out must be a numeric vector");
  }
  if (XLENGTH(out_) < n) {
    Rf_error("out is shorter than n");
  }
  double *out = REAL(out_);
  GetRNGstate();
  for (int i = 0; i < n; ++i) {
    out[i] = unif_rand();
  }
  PutRNGstate();
  return out_;
}

SEXP rand_unif(SEXP n_) {
  int n = asInteger(n_);
  if (n < 0) {
    Rf_error("n must be non-negative");
  }
  SEXP out = PROTECT(allocVector(REALSXP, n));
  double *ptr = REAL(out);
  GetRNGstate();
  for (int i = 0; i < n; ++i) {
    ptr[i] = unif_rand();
  }
  PutRNGstate();
  UNPROTECT(1);
  return out;
}
```

```{r}
build_rtinycc_module <- function() {
  tcc_ffi() |>
    tcc_source(rtinycc_code) |>
    tcc_bind(
      noop = list(args = list(), returns = "void"),
      fill_rand = list(args = list("numeric_array", "i32"), returns = "void"),
      rand_unif = list(
        args = list("i32"),
        returns = list(type = "numeric_array", length_arg = 1, free = TRUE)
      )
    ) |>
    tcc_compile()
}

build_callme_module <- function() {
  before <- names(getLoadedDLLs())
  mod <- callme::compile(callme_code, env = NULL, verbosity = 0)
  dlls <- getLoadedDLLs()
  new_names <- setdiff(names(dlls), before)
  new_names <- new_names[startsWith(new_names, "callme_")]
  attr(mod, "dll_paths") <- unname(vapply(
    dlls[new_names],
    function(x) x[["path"]],
    character(1)
  ))
  mod
}

unload_callme_dlls <- function(dll_paths) {
  dll_paths <- rev(unique(dll_paths))
  if (is.null(dll_paths) || !length(dll_paths)) {
    return(invisible(NULL))
  }
  for (dll_path in dll_paths) {
    if (is.character(dll_path) && nzchar(dll_path) && file.exists(dll_path)) {
      try(dyn.unload(dll_path), silent = TRUE)
    }
  }
  invisible(NULL)
}

build_and_dispose_callme_module <- function() {
  mod <- build_callme_module()
  dll_paths <- attr(mod, "dll_paths", exact = TRUE)
  rm(mod)
  gc()
  unload_callme_dlls(dll_paths)
  invisible(NULL)
}

callme_runtime_reason <- NULL
can_run_callme <- FALSE
if (!has_callme) {
  callme_runtime_reason <- "`callme` is not installed."
} else if (.Platform$OS.type == "windows") {
  callme_runtime_reason <- paste(
    "`callme` comparisons are skipped on Windows during vignette builds",
    "because the helper DLL compilation step is not reliable in CI."
  )
} else {
  callme_probe <- tryCatch(
    {
      build_and_dispose_callme_module()
      NULL
    },
    error = identity
  )
  if (inherits(callme_probe, "error")) {
    callme_runtime_reason <- paste(
      "`callme` comparisons were skipped because runtime compilation failed:",
      conditionMessage(callme_probe)
    )
  } else {
    can_run_callme <- TRUE
  }
}

can_run_benchmarks <- can_run_callme && has_bench
if (is.null(callme_runtime_reason) && !has_bench) {
  callme_runtime_reason <- "`bench` is not installed."
} else if (is.null(callme_runtime_reason)) {
  callme_runtime_reason <- "Executable comparisons are enabled."
}

with_benchmark_modules <- function(fun) {
  rt_mod <- build_rtinycc_module()
  cm_mod <- build_callme_module()
  dll_paths <- attr(cm_mod, "dll_paths", exact = TRUE)
  on.exit({
    rm(rt_mod, cm_mod)
    gc()
    unload_callme_dlls(dll_paths)
  }, add = TRUE)
  fun(rt_mod, cm_mod)
}

median_elapsed <- function(expr, times = 3L) {
  expr <- substitute(expr)
  env <- parent.frame()
  stats::median(replicate(
    times,
    {
      gc()
      t0 <- proc.time()[["elapsed"]]
      eval(expr, envir = env)
      proc.time()[["elapsed"]] - t0
    }
  ))
}

run_noop <- function(fun, n) {
  for (i in seq_len(n)) {
    fun()
  }
  invisible(NULL)
}

run_rand <- function(fun, n, reps) {
  for (i in seq_len(reps)) {
    invisible(fun(n))
  }
  invisible(NULL)
}

run_fill <- function(fun, n, reps) {
  for (i in seq_len(reps)) {
    out <- numeric(n)
    invisible(fun(out, n))
  }
  invisible(NULL)
}

rtinycc_recipe <- tcc_ffi() |>
  tcc_source(rtinycc_code) |>
  tcc_bind(
    noop = list(args = list(), returns = "void"),
    fill_rand = list(args = list("numeric_array", "i32"), returns = "void"),
    rand_unif = list(
      args = list("i32"),
      returns = list(type = "numeric_array", length_arg = 1, free = TRUE)
    )
  )

generated_code <- Rtinycc:::generate_ffi_code(
  symbols = rtinycc_recipe$symbols,
  headers = rtinycc_recipe$headers,
  c_code = rtinycc_recipe$c_code,
  is_external = FALSE,
  structs = rtinycc_recipe$structs,
  unions = rtinycc_recipe$unions,
  enums = rtinycc_recipe$enums,
  globals = rtinycc_recipe$globals,
  container_of = rtinycc_recipe$container_of,
  field_addr = rtinycc_recipe$field_addr,
  struct_raw_access = rtinycc_recipe$struct_raw_access,
  introspect = rtinycc_recipe$introspect
)
```

## Availability

If `callme` or `bench` is unavailable, or if the current build environment cannot compile the temporary `callme` helper DLL, the executable comparisons below are skipped.

```{r}
has_callme
```

```{r}
has_bench
```

```{r}
can_run_callme
```

```{r}
can_run_benchmarks
```

Current comparison status:

```{r}
callme_runtime_reason
```

## Compilation Latency

This measures module build time, not call time.

```{r, eval = can_run_callme}
compile_times <- data.frame(
  implementation = c("Rtinycc", "callme"),
  seconds = c(
    median_elapsed(build_rtinycc_module(), times = 3L),
    median_elapsed(build_and_dispose_callme_module(), times = 3L)
  )
)
compile_times$milliseconds <- round(compile_times$seconds * 1000, 1)
compile_times
```

The expected pattern is:

- `Rtinycc` wins clearly on tiny compile latency because it stays in-process and does not shell out to `R CMD SHLIB`
- `callme` pays the ordinary shared-library toolchain cost

## Generated Wrapper Code

The generated code makes the extra return-path work explicit. In particular, the `rand_unif()` wrapper allocates an R vector, `memcpy()`s the native `double*` buffer into it, then `free()`s the original buffer. In contrast, `fill_rand()` uses the borrowed `numeric_array` input path.

```{r, results='asis'}
Rtinycc:::rtinycc_c_block(generated_code)
```

## `noop()` Call Overhead

This is the smallest useful call path. It approximates the lower bound on call overhead above a plain `.Call()` entry point.
```{r, eval = can_run_benchmarks}
noop_bench <- with_benchmark_modules(function(rt_mod, cm_mod) {
  n_noop <- 1000L
  bench::mark(
    Rtinycc = run_noop(rt_mod$noop, n_noop),
    callme = run_noop(cm_mod$noop, n_noop),
    iterations = 20,
    check = TRUE,
    memory = TRUE,
    filter_gc = FALSE
  )
})
noop_bench
```

Interpretation:

- the `callme` path is close to the cost of a conventional `.Call()` wrapper
- the `Rtinycc` path adds the generated wrapper layer and external-pointer call target
- the difference here is mostly boundary overhead, not useful computation
- `check = TRUE` is appropriate here because both expressions always return `NULL`
- `bench` also exposes allocation and GC differences directly, which is useful for understanding the cost of boxing and copying

## `fill_rand(out, n)` And Zero-Copy Arrays

This is the fairer vector comparison because both implementations fill an existing R numeric vector instead of returning a newly allocated result.

```{r, eval = can_run_benchmarks}
fill_bench_n4096 <- with_benchmark_modules(function(rt_mod, cm_mod) {
  bench::mark(
    Rtinycc = run_fill(rt_mod$fill_rand, 4096L, 100L),
    callme = run_fill(cm_mod$fill_rand, 4096L, 100L),
    iterations = 20,
    check = FALSE,
    memory = TRUE,
    filter_gc = FALSE
  )
})
fill_bench_n4096
```

Interpretation:

- both sides now write into ordinary R numeric storage
- this removes the return-copy penalty from the comparison
- the remaining gap is mostly call boundary overhead plus backend code quality

## `rand_unif(n)` And Copy Cost

Here the implementation work is still small, but the return path differs:

- `callme` fills the final R vector directly
- `Rtinycc` fills a native buffer, then the wrapper copies into a fresh R vector

We time both a tiny and a larger return size.
```{r, eval = can_run_benchmarks}
rand_results <- with_benchmark_modules(function(rt_mod, cm_mod) {
  rand_bench_n1 <- bench::mark(
    Rtinycc = run_rand(rt_mod$rand_unif, 1L, 1000L),
    callme = run_rand(cm_mod$rand_unif, 1L, 1000L),
    iterations = 20,
    check = FALSE,
    memory = TRUE,
    filter_gc = FALSE
  )
  rand_bench_n4096 <- bench::mark(
    Rtinycc = run_rand(rt_mod$rand_unif, 4096L, 100L),
    callme = run_rand(cm_mod$rand_unif, 4096L, 100L),
    iterations = 20,
    check = FALSE,
    memory = TRUE,
    filter_gc = FALSE
  )
  list(rand_bench_n1 = rand_bench_n1, rand_bench_n4096 = rand_bench_n4096)
})
rand_results$rand_bench_n1
rand_results$rand_bench_n4096
```

The usual pattern is:

- for `fill_rand()`, the comparison is much closer to `Rtinycc`'s intended array-oriented usage
- for `n = 1`, wrapper overhead and return-path mechanics dominate
- for larger `n`, the copy still matters, but more of the time is spent in the actual loop and RNG generation

## What These Numbers Mean

The benchmark gives a reasonable mental model:

- `Rtinycc` is optimized for low compilation latency and direct interactive use
- for very small scalar calls, a traditional `.Call()` entry point has lower overhead
- when `Rtinycc` must copy returned buffers into R vectors, that copy is real and measurable
- part of the runtime gap is also expected backend quality: `callme` uses the system compiler, while `Rtinycc` uses TinyCC
- the main way to amortize the boundary cost is to do more work per call

So the package is usually strongest when:

- compile latency matters
- you want to bind plain C signatures quickly
- you batch work into array-oriented or coarse-grained calls

It is less ideal when:

- every microsecond of scalar-call overhead matters
- you can already afford and manage a regular shared-library toolchain
- you need a direct `.Call()` entry point that writes its final result straight into R-managed objects