## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
.old_opts <- options(digits = 3)

## ----workflow-schematic, echo=FALSE, warning = FALSE, message= FALSE, fig.width=8, fig.height=4, fig.cap="Typical workflow: global test (top) and pointwise loss plot (bottom)."----
library(ggplot2)

box <- function(x, y, w, h, label) {
  data.frame(
    xmin = x - w / 2, xmax = x + w / 2,
    ymin = y - h / 2, ymax = y + h / 2,
    label = label, x = x, y = y
  )
}

top <- rbind(
  box(1.2, 5, 1.1, 0.55, "Data\n(df_gen or yours)"),
  box(3.1, 5, 1.0, 0.55, "Long format +\ncentered diffs"),
  box(4.9, 5.45, 0.85, 0.45, "calc_Z"),
  box(4.9, 4.55, 0.85, 0.45, "calc_eig"),
  box(6.8, 5, 0.95, 0.55, "calc_pval")
)
bot <- rbind(
  box(3.1, 2.6, 1.0, 0.55, "calc_L_s2"),
  box(5.0, 2.6, 0.85, 0.55, "plot_pcb")
)
rects <- rbind(top, bot)

seg <- data.frame(
  x = c(1.75, 2.65, 4.45, 4.45, 5.35, 5.35, 3.1, 3.65),
  y = c(5, 5, 5.45, 4.55, 5.45, 4.55, 4.45, 2.6),
  xend = c(2.6, 3.6, 5.35, 5.35, 6.35, 6.35, 3.1, 4.55),
  yend = c(5, 5, 5.45, 4.55, 5, 5, 3.15, 2.6)
)

ggplot() +
  geom_rect(
    data = rects,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = "grey96", color = "grey35", size = 0.35
  ) +
  geom_text(data = rects, aes(x = x, y = y, label = label), size = 3) +
  geom_segment(
    data = seg,
    aes(x = x, y = y, xend = xend, yend = yend),
    arrow = grid::arrow(length = grid::unit(0.12, "cm"), type = "closed"),
    size = 0.35,
    color = "grey25"
  ) +
  annotate("text", x = 4.9, y = 5.95, label = "Global delta test (chi-square approx.)", size = 3.2, fontface = "italic") +
  annotate("text", x = 4.0, y = 1.85, label = "Pointwise mean loss + naive band", size = 3.2, fontface = "italic") +
  coord_cartesian(xlim = c(0.3, 7.5), ylim = c(1.5, 6.2), clip = "off") +
  theme_void() +
  theme(plot.margin = grid::unit(c(12, 8, 8, 8), "pt"))

## ----load, message = FALSE, include = FALSE-----------------------------------
require(dplyr)
require(tidyr)
require(gridExtra)
require(RSpectra)
require(rlist)

## ----include = FALSE----------------------------------------------------------
library(rtForecastEval)

## -----------------------------------------------------------------------------
library(ggplot2)
library(tibble)
library(MASS)
nsamp <- 100 # number of in-game events
ngame <- 200 # number of games (README example uses the same for comparable figures)

#' Parameter for generating the eigenvalues, and p-values
D <- 10 # Number of eigenvalues to keep
N_MC <- 5000 # for simulating the p-value
L <- function(x, y) {
  return((x - y) ^ 2)
}

# Data generation ---------------------------------------------------------
df_equ <- df_gen(N = nsamp, Ngame = ngame) %>%
  group_by(grid) %>%
  mutate(
    p_bar_12 = mean(phat_A - phat_B),
    diff_non_cent = phat_A - phat_B,
    diff_cent = phat_A - phat_B - p_bar_12
  ) %>% ungroup()

# Apply our test (explicit construction, as in utility.R) ----------------

Z <- df_equ %>% group_by(grid) %>%
  summarise(delta_n = mean(L(phat_A, Y) - L(phat_B, Y))) %>%
  {
    sum((.)$delta_n^2) / nsamp * ngame
  }

temp <- df_equ %>% group_split(grid, .keep = FALSE)

eigV_hat <- lapply(1:nsamp, function(i) {
  sapply(1:nsamp, function(j) {
    as.numeric(temp[[i]]$diff_non_cent %*% temp[[j]]$diff_non_cent / ngame)
  })
}) %>% list.rbind() %>% {
  RSpectra::eigs_sym(
    A = (.),
    k = D,
    which = "LM",
    opts = list(retvec = FALSE)
  )$values
} %>%
  {
    (.)/nsamp
  }

eigV_til <- lapply(1:nsamp, function(i) {
  sapply(1:nsamp, function(j) {
    as.numeric(temp[[i]]$diff_cent %*% temp[[j]]$diff_cent / ngame)
  })
}) %>% list.rbind() %>% {
  RSpectra::eigs_sym(
    A = (.),
    k = D,
    which = "LM",
    opts = list(retvec = FALSE)
  )$values
} %>%
  {
    (.)/nsamp
  }

MC_hat <- sapply(1:N_MC, function(x) {
  crossprod(eigV_hat, rchisq(D, df = 1))
})

q_90_hat <- quantile(MC_hat, 0.90)
q_95_hat <- quantile(MC_hat, 0.95)
q_99_hat <- quantile(MC_hat, 0.99)

MC_til <- sapply(1:N_MC, function(x) {
  crossprod(eigV_til, rchisq(D, df = 1))
})

q_90_til <- quantile(MC_til, 0.90)
q_95_til <- quantile(MC_til, 0.95)
q_99_til <- quantile(MC_til, 0.99)

p_hat <- 1 - ecdf(MC_hat)(Z)

tibble(
  type = c("non-center", "center"),
  Z = rep(Z, 2),
  "pval" = c(p_hat, p_hat),
  "90%" = c(q_90_hat, q_90_til),
  "95%" = c(q_95_hat, q_95_til),
  "99%" = c(q_99_hat, q_99_til)
)

## ----function wrappers, fig.width = 7, fig.height = 4.2, fig.cap = "Pointwise mean loss difference (A vs B) with naive normal band — simulation setting. This is a skill curve, not a calibration diagram."----
to_center <- FALSE

ZZ <- calc_Z(df = df_equ, pA = "phat_A", pB = "phat_B", Y = "Y", nsamp = nsamp, ngame = ngame)
eigg <- calc_eig(
  df = df_equ, n_eig = D, ngame = ngame,
  nsamp = nsamp, grid = "grid", cent = to_center
)
oh <- calc_pval(ZZ, eig = eigg, quan = c(0.90, 0.95, 0.99), n_MC = N_MC)

temp <- calc_L_s2(df = df_equ, pA = "phat_A", pB = "phat_B")

plot_pcb(df = temp)

tibble(
  type = ifelse(to_center, "center", "non-center"),
  Z = ZZ,
  pval = oh$p_val,
  "90%" = oh$quantile[1],
  "95%" = oh$quantile[2],
  "99%" = oh$quantile[3]
)

## ----calibration-reliability, fig.width = 7.5, fig.height = 4, fig.cap = "Binned reliability diagram at a fixed grid (closest to 0.5): mean outcome vs mean forecast for A and B; grey bars are exact 95% central Binomial range for mean(Y) under H0. Points near the diagonal indicate better marginal calibration at that time."----
g_mid <- df_equ %>%
  distinct(grid) %>%
  slice_min(order_by = abs(grid - 0.5), n = 1) %>%
  pull(grid)

slice_t <- df_equ %>%
  filter(grid == g_mid) %>%
  transmute(
    game,
    Y,
    phat_A,
    phat_B
  )

rel_long <- slice_t %>%
  tidyr::pivot_longer(c(phat_A, phat_B), names_to = "forecaster", values_to = "phat") %>%
  mutate(forecaster = ifelse(forecaster == "phat_A", "Forecaster A", "Forecaster B"))

rel_binned <- rel_long %>%
  group_by(forecaster) %>%
  mutate(bin = ntile(phat, 10)) %>%
  group_by(forecaster, bin) %>%
  summarise(
    mean_forecast = mean(phat),
    mean_outcome = mean(Y),
    n_games = dplyr::n(),
    .groups = "drop"
  ) %>%
  mutate(
    p_bin = pmin(pmax(mean_forecast, 0), 1),
    ci_lo = stats::qbinom(0.025, size = n_games, prob = p_bin) / n_games,
    ci_hi = stats::qbinom(0.975, size = n_games, prob = p_bin) / n_games
  )

ggplot(rel_binned, aes(mean_forecast, mean_outcome)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey45") +
  geom_errorbar(
    aes(x = mean_forecast, ymin = ci_lo, ymax = ci_hi),
    width = 0.02,
    color = "grey55",
    alpha = 0.85,
    linewidth = 0.35
  ) +
  geom_point(aes(size = n_games), alpha = 0.9, color = "darkred") +
  facet_wrap(~forecaster, nrow = 1) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1), ratio = 1) +
  labs(
    title = "Binned reliability (calibration) at one grid",
    subtitle = paste0(
      "Grid = ", signif(g_mid, 3),
      " (closest to 0.5); grey: exact 95% Binomial range for mean(Y) under H0"
    ),
    x = "Mean forecast in bin",
    y = "Mean outcome (Y) in bin",
    size = "Games"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor = ggplot2::element_blank(),
    plot.title = ggplot2::element_text(face = "bold")
  )

## ----reliability-joint-difference, fig.width = 8, fig.height = 4.2, fig.cap = "Shared midpoint bins: two standard reliability diagrams (A and B); grey bars = exact 95% Binomial range for mean(Y) under H0."----
cal_bins <- slice_t %>%
  mutate(mid = (phat_A + phat_B) / 2) %>%
  mutate(bin = cut(mid, breaks = seq(0, 1, length.out = 11), include.lowest = TRUE))

rel_joint <- cal_bins %>%
  group_by(bin) %>%
  summarise(
    mean_forecast_A = mean(phat_A),
    mean_forecast_B = mean(phat_B),
    mean_outcome = mean(Y),
    n_games = dplyr::n(),
    .groups = "drop"
  ) %>%
  filter(!is.na(bin)) %>%
  mutate(
    p_A = pmin(pmax(mean_forecast_A, 0), 1),
    p_B = pmin(pmax(mean_forecast_B, 0), 1),
    ci_lo_A = stats::qbinom(0.025, size = n_games, prob = p_A) / n_games,
    ci_hi_A = stats::qbinom(0.975, size = n_games, prob = p_A) / n_games,
    ci_lo_B = stats::qbinom(0.025, size = n_games, prob = p_B) / n_games,
    ci_hi_B = stats::qbinom(0.975, size = n_games, prob = p_B) / n_games
  )

rel_joint_long <- rel_joint %>%
  tidyr::pivot_longer(
    c(mean_forecast_A, mean_forecast_B),
    names_to = "which",
    names_prefix = "mean_forecast_",
    values_to = "mean_forecast"
  ) %>%
  mutate(
    forecaster = ifelse(which == "A", "Forecaster A", "Forecaster B"),
    ci_lo = ifelse(which == "A", ci_lo_A, ci_lo_B),
    ci_hi = ifelse(which == "A", ci_hi_A, ci_hi_B)
  )

ggplot(rel_joint_long, aes(mean_forecast, mean_outcome)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey45") +
  geom_errorbar(
    aes(x = mean_forecast, ymin = ci_lo, ymax = ci_hi),
    width = 0.02,
    color = "grey55",
    alpha = 0.85,
    linewidth = 0.4
  ) +
  geom_point(aes(color = forecaster, size = n_games), alpha = 0.9) +
  facet_wrap(~forecaster, nrow = 1) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1), ratio = 1) +
  scale_color_manual(values = c("Forecaster A" = "steelblue", "Forecaster B" = "orangered")) +
  labs(
    title = "Reliability: shared midpoint bins (one classical diagram per forecaster)",
    subtitle = "Bins from cut((phat_A + phat_B) / 2); grey = exact 95% Binomial H0 range for mean(Y)",
    x = "Mean forecast in bin",
    y = "Mean outcome (Y) in bin",
    color = NULL,
    size = "Games"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor = ggplot2::element_blank(),
    plot.title = ggplot2::element_text(face = "bold"),
    legend.position = "right"
  ) +
  guides(color = "none")

## ----calibration-compare-forecasters, fig.width = 7.5, fig.height = 4.5, fig.cap = "Mean calibration error per forecaster (solid) and difference in mean forecasts (dashed), binned by (phat_A + phat_B) / 2 at the same grid as the reliability figure."----
cal_compare <- cal_bins %>%
  group_by(bin) %>%
  summarise(
    mid_hat = mean(mid),
    cal_err_A = mean(Y) - mean(phat_A),
    cal_err_B = mean(Y) - mean(phat_B),
    n_games = dplyr::n(),
    .groups = "drop"
  ) %>%
  filter(!is.na(bin))

cal_long <- cal_compare %>%
  tidyr::pivot_longer(
    c(cal_err_A, cal_err_B),
    names_to = "which",
    values_to = "cal_err"
  ) %>%
  mutate(
    which = ifelse(which == "cal_err_A", "A: mean(Y) - phat_A", "B: mean(Y) - phat_B")
  )

ggplot(cal_long, aes(mid_hat, cal_err, color = which)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey70") +
  geom_line(linewidth = 0.85) +
  geom_point(aes(size = n_games), alpha = 0.85) +
  geom_line(
    data = cal_compare,
    aes(mid_hat, cal_err_B - cal_err_A),
    inherit.aes = FALSE,
    color = "grey35",
    linetype = "dashed",
    linewidth = 0.65
  ) +
  labs(
    title = "Calibration comparison (same midpoint bins)",
    subtitle = "Solid: mean calibration error per forecaster; dashed: mean(phat_B) - mean(phat_A)",
    x = "Mean (phat_A + phat_B) / 2 in bin",
    y = "mean(Y) - mean(phat)",
    color = "Curve",
    size = "Games"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor = ggplot2::element_blank(),
    plot.title = ggplot2::element_text(face = "bold"),
    legend.position = "bottom"
  )

## ----vignette-options-restore, include = FALSE--------------------------------
options(.old_opts)