## -----------------------------------------------------------------------------
#| label: setup
#| include: false
#| cache: false

library(vegan)
library(dplyr)
library(tidyr)
library(ggplot2)
library(dggridR)
library(knitr)

set.seed(19)


## ----warning=FALSE, message=FALSE, eval=TRUE----------------------------------
#| label: bt_package_loading
#| cache: false
# Install and load the latest version of BioTIMEr
library(BioTIMEr)


## -----------------------------------------------------------------------------
#| label: data_description
#| cache: false
# you can run the following commands to retrieve more information about the subsets.
?BTsubset_meta
?BTsubset_data


## -----------------------------------------------------------------------------
#| label: "package info"
#| cache: false
# you can also see a full list of BioTIMEr functions and help pages by:
??BioTIMEr


## -----------------------------------------------------------------------------
#| label: tbl-gridding_ex1
grid_samples <- gridding(
  meta = BTsubset_meta,
  btf = BTsubset_data,
  res = 12,
  resByData = FALSE
)

# Get a look at the output
grid_samples |> head() |> kable()


## -----------------------------------------------------------------------------
#| label: tbl-gridding_ex2
check_res_1 <- grid_samples |>
  summarise(
    n_cell = n_distinct(cell),
    n_aID = n_distinct(assemblageID),
    res = "res12",
    .by = c(STUDY_ID, StudyMethod)
  )

check_res_1 |> head(10) |> kable()

## -----------------------------------------------------------------------------
#| label: fig-gridding_ex
# How many samples were there in each study?
ggplot(data = check_res_1) +
  geom_bar(
    mapping = aes(x = as.character(STUDY_ID), y = n_aID, fill = res),
    stat = "identity"
  ) +
  scale_fill_discrete(type = c("#155f49")) +
  xlab("StudyID") +
  ylab("Number of assemblages in a study") +
  theme_bw() +
  theme(legend.position = "none")


## -----------------------------------------------------------------------------
#| label: gridding_ex2
#| results: hold
# define an alternative resolution of 14 (~10.7 km2 cells)
grid_samples_14 <- gridding(
  meta = BTsubset_meta,
  btf = BTsubset_data,
  res = 14,
  resByData = FALSE
)

# allow the spatial extent of the data to define the resolution
grid_samples_auto <- gridding(
  meta = BTsubset_meta,
  btf = BTsubset_data,
  res = 12,
  resByData = TRUE
)
# this option also returns a message with the automatically picked resolution:


## -----------------------------------------------------------------------------
#| label: fig-gridding_ex3
check_res_2 <- grid_samples_14 |>
  summarise(
    n_cell = n_distinct(cell),
    n_aID = n_distinct(assemblageID),
    res = "res14",
    .by = c(StudyMethod, STUDY_ID)
  )

check_res_3 <- grid_samples_auto |>
  summarise(
    n_cell = n_distinct(cell),
    n_aID = n_distinct(assemblageID),
    res = "res15",
    .by = c(StudyMethod, STUDY_ID)
  )

checks <- rbind(check_res_1, check_res_2, check_res_3)

ggplot(data = checks) +
  geom_bar(
    mapping = aes(x = as.character(STUDY_ID), y = n_aID, fill = res),
    stat = "identity",
    position = "dodge"
  ) +
  scale_fill_discrete(type = c("#155f49", "#66c1d1", "#d9d956")) +
  xlab("StudyID") +
  ylab("Number of assemblages in a study") +
  theme_bw()


## -----------------------------------------------------------------------------
#| label: fig-gridding_map
## The following example is built on the demonstrations in
## https://cran.r-project.org/web/packages/dggridR/vignettes/dggridR.html.

# First we build the ~96 km2 global grid
dgg_12 <- dggridR::dgconstruct(res = 12)

# To simplify, we only map the grid cell boundaries for cells which
# have observations.
# NOTE: if you are working with the full BioTIME database, this step may take some time.
grid_12 <- dggridR::dgcellstogrid(dgg_12, as.integer(grid_samples$cell))

# Now let's follow the same steps and build a ~10.7 km2 global grid:
dgg_14 <- dggridR::dgconstruct(res = 14)
grid_14 <- dggridR::dgcellstogrid(dgg_14, as.integer(grid_samples_14$cell))

# And we get some polygons for each country of the world, to create a background:
countries <- ggplot2::map_data("world")

# Now you could map the whole world, but let's just zoom in the UK and have a look at
# STUDY 466 (Marine Fish):
map_uk_locations <- ggplot() +
  geom_polygon(data = countries, aes(x = long, y = lat, group = group), fill = NA, color = "grey") +
  geom_sf(data = grid_12, aes(), color = alpha("blue", 0.4)) +
  coord_sf(xlim = c(-20, 10), ylim = c(50, 60)) +
  geom_rect(aes(xmin = -11, xmax = -0.7, ymin = 57.2, ymax = 59), colour = "red", fill = NA) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(text = element_text(size = 8))

zoom_in_map <- ggplot() +
  geom_polygon(data = countries, aes(x = long, y = lat, group = group), fill = NA,
               color = "grey") +
  geom_sf(data = grid_12, aes(), color = alpha("blue", 0.4)) +
  geom_sf(data = grid_14, aes(), color = alpha("red", 0.4)) +
  coord_sf(xlim = c(-11, -0.7), ylim = c(57.2, 59)) +
  theme_bw()

grid::grid.newpage()
main <- grid::viewport(width = 1, height = 1, x = 0.5, y = 0.5)  # the main map
inset <- grid::viewport(width = 0.4, height = 0.4, x = 0.82, y = 0.45)  # the inset in bottom left

# The resulting distribution of different size cells appears as follows:
print(zoom_in_map, vp = main)
print(map_uk_locations, vp = inset)


## -----------------------------------------------------------------------------
#| label: tbl-resampling_ex
# First, if you are not sure you need this step,
# you can always check how many samples there are in every year of the different time series:
check_samples <- grid_samples |>
  summarise(
    n_samples = n_distinct(SAMPLE_DESC),
    n_species = n_distinct(Species),
    .by = c(STUDY_ID, assemblageID, YEAR)
  )

check_samples |> head(10) |> kable()


## -----------------------------------------------------------------------------
#| label: resampling_ex1
# Let's apply resampling() then, using the data frame of the gridded data:
grid_rare <- resampling(
  x = grid_samples,
  measure = "ABUNDANCE",
  resamps = 1,
  conservative = FALSE
)


## -----------------------------------------------------------------------------
#| label: tests1
#| include: false
# Let's apply it then:

#g rid_samples_temp <- subset(grid_samples, !grid_samples$BIOMASS==0) #to be deleted after Faye reviews the data and makes all 0=NAs
# grid_rare <- resampling( x= grid_samples_test, measure ="BIOMASS", resamps = 1, conservative = FALSE)


## -----------------------------------------------------------------------------
#| label: resampling_ex12
# Keep only observations with both abundance and biomass
grid_rare_ab <- resampling(
  x = grid_samples,
  measure = c("ABUNDANCE", "BIOMASS"),
  resamps = 1,
  conservative = FALSE
)

# Keep only sampling events where all observations within had both abundance and biomass to start with
grid_rare_abT <- resampling(
  x = grid_samples,
  measure = c("ABUNDANCE", "BIOMASS"),
  resamps = 1,
  conservative = TRUE)


## -----------------------------------------------------------------------------
#| label: fig-resampling_ex3
# What is the number of samples in the year with the lowest sampling effort?
ggplot(data = check_samples[check_samples$assemblageID == "18_335699",], aes(x = YEAR, y = n_samples)) +
  geom_col(aes(x = YEAR, y = n_samples), fill = "red", alpha = 0.5) +
  geom_segment(aes(x = 1926, y = min(n_samples) + 3,
                   xend = 1927, yend = min(n_samples)),
               arrow = arrow(length = unit(0.2, "cm"))) +
  xlab("Year") +
  ylab("Number of samples") +
  theme_bw()

# In this case,the year 1927 had the lowest sampling effort, with 3 samples (arrow).


## -----------------------------------------------------------------------------
#| label: tbl-resampling_ex4
# Let's implement the sample-based rarefaction by resampling the dataset 10 times.
grid_rare_n10 <- resampling(
  x = grid_samples,
  measure = "ABUNDANCE",
  resamps = 10,
  conservative = FALSE
)

# Note that you may want to resample many more times (e.g. at least 30-100+ times, but up to 199
# if e.g. working with the whole BioTIME data), depending on how many iterations you want a
# subsequent bootstrap analysis to have.
# This may also take some computation time, so if you are working with a big subset of BioTIME
# is advisable to break it down in smaller subsets.

# Each resampling iteration will be identified as resamp = 1:n, in this case 1:10.
# Now we can check if there are differences across the first 3 of these iterations:

check_resamps <- grid_rare_n10[grid_rare_n10$resamp < 4, ] |>
  summarise(
    n_obs = n(),
    n_species = n_distinct(Species),
    n_year = n_distinct(YEAR),
    .by = c(STUDY_ID, assemblageID, resamp)
  )

check_resamps |> head(10) |> kable()


## -----------------------------------------------------------------------------
#| label: metrics
# Get alpha metrics estimates:
alpha_metrics <- getAlphaMetrics(x = grid_rare, measure = "ABUNDANCE")
# see also help("getAlphaMetrics") for more details on the metrics

# Have a quick look at the output
alpha_metrics |> head(6) |> kable()

# Get beta metrics estimates:
beta_metrics <- getBetaMetrics(x = grid_rare, measure = "ABUNDANCE")
#see also help("getBetaMetrics") for more details on the metrics

# Have a quick look at the output
beta_metrics |> head(6) |> kable()
# NOTE the functions used the rarefied data with only one resampling iteration


## -----------------------------------------------------------------------------
#| label: tests2
#| include: false
# # Get alpha metrics estimates:
# alpha_metrics <- getAlphaMetrics(x = grid_rare_ab, measure = "BIOMASS")
# #see also help("getAlphaMetrics") for more details on the metrics
#
# #Have a quick look at the output
# alpha_metrics |> head(6) |> kable()
#
# # Get beta metrics estimates:
# beta_metrics <- getBetaMetrics(x = grid_rare_ab, measure = "BIOMASS")
# #see also help("getBetaMetrics") for more details on the metrics
#
# #Have a quick look at the output
# beta_metrics |> head(6) |> kable()


## -----------------------------------------------------------------------------
#| label: tbl-trends1
# Let's apply it then:
alpha_slopes <- getLinearRegressions(
  x = alpha_metrics,
  pThreshold = 0.05
) #for alpha metrics

# Have a quick look at the output
alpha_slopes |> head(6) |> kable()

## -----------------------------------------------------------------------------
#| label: tbl-trends2
# Let's apply it then:
beta_slopes <- getLinearRegressions(
  x = beta_metrics,
  pThreshold = 0.05
) #for beta metrics

# Have a quick look at the output
beta_slopes |> head(6) |> kable()


## -----------------------------------------------------------------------------
#| label: tbl-trends3
#| fig-width: 10
#| fig-height: 7
# First, how many assemblages in our dataset show a moderate evidence (P < 0.05) of change in alpha diversity?
alpha_slopes |>
  filter(significance == 1) |>   # or use filter(pvalue<0.05)
  summarise(n_sig = n_distinct(assemblageID), mean(slope), .by = metric) |>
  kable()

# We can see that only a few (<40) of the assemblage time series actually show a significant
# trend of change over time, independently of the metric used. This indicates that in most
# time series in the studies we analysed alpha diversity is not really changing through time.


## -----------------------------------------------------------------------------
#| label: fig-trends
#| fig-width: 7.2
#| fig-height: 5
# Get a slope per assemblageID and metric
alpha_slopes_simp <- alpha_slopes |>
  filter(significance == 1) |> #select only the assemblages with significant trends
  summarise(slope = unique(slope), .by = c(assemblageID, metric, pvalue))

# Calculate the mean slope and CIs
stats_alpha <- alpha_slopes_simp |>
  summarise(
    mean_slope = mean(slope), #mean
    ci_slope = qt(0.95, df = length(slope) - 1) *
      (sd(slope, na.rm = TRUE) / sqrt(length(slope))),
    .by = metric
  ) #margin of error

# Let's put it all together
ggplot(data = alpha_slopes_simp) +
  geom_histogram(aes(x = slope, fill = metric), bins = 25) +
  #geom_density(alpha=0.5)+ #in case you want a density plot instead
  geom_vline(
    aes(xintercept = 0),
    linetype = 3,
    colour = "grey",
    linewidth = 0.5
  ) + #add slope=0 line
  geom_vline(
    data = stats_alpha,
    aes(xintercept = mean_slope),
    linetype = 1,
    linewidth = 0.5,
    colour = "black"
  ) + #mean
  geom_vline(
    data = stats_alpha,
    aes(xintercept = mean_slope - ci_slope),
    linetype = 2,
    linewidth = 0.5,
    colour = "black"
  ) + #lower confidence interval
  geom_vline(
    data = stats_alpha,
    aes(xintercept = mean_slope + ci_slope),
    linetype = 2,
    linewidth = 0.5,
    colour = "black"
  ) + #upper confidence interval
  facet_wrap(~metric, scales = "free") +
  scale_fill_biotime() + #using the customize BioTIME colour scale. See help("scale_color_biotime") for more options.
  ggtitle("Alpha diversity change") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11, hjust = 0.5)
  )


## -----------------------------------------------------------------------------
#| label: fig-trends2
#| fig-width: 7.2
#| fig-height: 1.9

# Get a slope per assemblageID and metric
beta_slopes_simp <- beta_slopes |>
  filter(significance == 1) |> #select only the assemblages with significant trends
  summarise(slope = unique(slope), .by = c(assemblageID, metric, pvalue))

# Calculate the mean slope and CIs
stats_beta <- beta_slopes_simp |>
  summarise(
    mean_slope = mean(slope), #mean
    ci_slope = qt(0.95, df = length(slope) - 1) *
      (sd(slope, na.rm = TRUE) / sqrt(length(slope))),
    .by = metric
  ) #margin of error

# Let's put it all together
ggplot(data = beta_slopes_simp) +
  geom_histogram(aes(x = slope, fill = metric), bins = 25) +
  #geom_density(alpha=0.5)+ #in case you want a density plot instead
  geom_vline(
    aes(xintercept = 0),
    linetype = 3,
    colour = "grey",
    linewidth = 0.5
  ) + #add slope=0 line
  geom_vline(
    data = stats_beta,
    aes(xintercept = mean_slope),
    linetype = 1,
    linewidth = 0.5,
    colour = "black"
  ) + #mean
  geom_vline(
    data = stats_beta,
    aes(xintercept = mean_slope - ci_slope),
    linetype = 2,
    linewidth = 0.5,
    colour = "black"
  ) + #lower confidence interval
  geom_vline(
    data = stats_beta,
    aes(xintercept = mean_slope + ci_slope),
    linetype = 2,
    linewidth = 0.5,
    colour = "black"
  ) + #upper confidence interval
  facet_wrap(~metric, scales = "free") +
  scale_fill_biotime() + #using the customize BioTIME colour scale. See help("scale_color_biotime") for more options.
  ggtitle("Beta diversity change") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11, hjust = 0.5)
  )


## -----------------------------------------------------------------------------
#| label: trends5
# Hint: If you wish to plot all metrics together, simply merge you alpha_slopes and
# beta_slopes dataframes beforehand.


## -----------------------------------------------------------------------------
#| label: fig-trends6
#| fig-width: 7.2
#| fig-height: 2.3
# First, we need to check the meta file and retrieve the information for the studies of interest
#head(BTsubset_meta)
meta <- select(BTsubset_meta, STUDY_ID, TAXA, REALM, CLIMATE)

# Get a slope per assemblageID and metric
alpha_slopes_simp <- alpha_slopes |>
  summarise(
    slope = unique(slope),
    .by = c(assemblageID, metric, pvalue, significance)
  )

# Get back the Study ID by separating our assemblageID column into multiple other columns.
alpha_slopes_simp <- alpha_slopes_simp |>
  separate_wider_delim(
    cols = assemblageID,
    names = c("STUDY_ID", "cell"),
    delim = "_",
    cols_remove = FALSE
  ) |>
  as.data.frame()

# Merge it all
alpha_slopes_meta <- merge(alpha_slopes_simp, meta, by = "STUDY_ID")

# Select relevant data for plotting
alpha_slopes_set1 <- subset(
  alpha_slopes_meta,
  alpha_slopes_meta$metric == "Shannon" &
    (alpha_slopes_meta$TAXA == "Fish" | alpha_slopes_meta$TAXA == "Birds")
)

# Let's put it all together
ggplot(data = alpha_slopes_set1, aes(x = slope, fill = TAXA)) +
  geom_histogram(fill = "grey", bins = 25) + #all assemblages
  geom_histogram(
    data = alpha_slopes_set1[alpha_slopes_set1$significance == 1, ],
    bins = 25
  ) + #only significant change assemblages
  geom_vline(
    aes(xintercept = 0),
    linetype = 3,
    colour = "black",
    linewidth = 0.5
  ) + #add slope=0 line
  facet_wrap(~TAXA, scales = "free") +
  scale_fill_biotime() +
  ggtitle("Shannon-Weiner Species Diversity Index") +
  theme_bw() +
  theme(plot.title = element_text(size = 11, hjust = 0.5))


## -----------------------------------------------------------------------------
#| label: trends7
# Now we see the full distribution of slopes of change for the Shannon index for the birds
# and fish time series in our subset: all slopes are shown in grey, while in color we show
# the subset of assemblages for which evidence (P < 0.05) of change was detected.


## -----------------------------------------------------------------------------
#| label: fig-trends8
#| fig-width: 7.2
#| fig-height: 5

# Let's go back to the data frame with the yearly values and select our relevant data for plotting
alpha_metrics_set <- subset(alpha_metrics, assemblageID == "18_335699")

# Transform data
alpha_metrics_set_long <- pivot_longer(alpha_metrics_set,
                                   cols = c("S", "N", "maxN", "Shannon", "expShannon",
                                   "Simpson", "invSimpson", "PIE", "DomMc"),
                                   names_to = "metric",
                                   names_transform = as.factor)

# Get assemblage ID slope of change
alpha_slopes_set2 <- subset(alpha_slopes, assemblageID == "18_335699")

# Get assemblage ID
name_assemblage <- unique(alpha_slopes_set2$assemblageID)

# Merge info
alpha_metr_slop<- left_join(
  x = alpha_slopes_set2,
  y = alpha_metrics_set_long,
  by = join_by(assemblageID, metric))

# Let's put it all together
ggplot(data = alpha_metr_slop, aes(x = YEAR, y = value)) +
    geom_point(colour = "#155f49", size = 1) +   #plot the yearly estimates
    stat_smooth(method = "lm", se = FALSE, linetype = 2, colour = "grey") + #draw all regression line
    stat_smooth(data = subset(alpha_metr_slop, alpha_metr_slop$significance == 1),
                aes(x = YEAR, y = value), method = "lm", se = FALSE,
                linetype = 2, colour = "#155f49") + #color only trends that are significant
    facet_wrap(~metric, scales = "free") +
    scale_fill_biotime() +
    ggtitle(paste("Assemblage", name_assemblage)) + ylab("Diversity") +
    theme_bw() +
    theme(plot.title = element_text(size = 11, hjust = 0.5))


## -----------------------------------------------------------------------------
#| label: trends9
#| include: false

# Hint: If you want to draw the p-value on the plot, you can try using the #ggpmisc::stat_fit_glance which takes anything passed through lm() in R and
#allows it to processed and printed using ggplot2.


## -----------------------------------------------------------------------------
#| label: citation
#| cache: false

citation("BioTIMEr")