## ----setup, include=FALSE-----------------------------------------------------
cran <- identical(tolower(Sys.getenv("NOT_CRAN")), "false")
if (cran || !curl::has_internet()) {
  knitr::opts_chunk$set(eval = FALSE, collapse = TRUE, comment = "#>")
} else {
  knitr::opts_chunk$set(eval = TRUE, collapse = TRUE, comment = "#>")
}


## ----librarydata--------------------------------------------------------------
library(specleanr)

## ----dataprocessing, fig.width = 6, fig.height= 4, fig.align='center'---------

data(efidata) 

data(jdsdata) 

danube <- sf::st_read(system.file('extdata', "danube.shp.zip",
                                  package = 'specleanr'), quiet=TRUE)


df_online <- getdata(data = c("Squalius cephalus", 'Salmo trutta',"Thymallus thymallus"),
                     extent = danube,
                     gbiflim = 50,
                     inatlim = 50,
                     vertlim = 50,
                     verbose = FALSE)


mergealldfs <- match_datasets(datasets = list(efi= efidata, jds = jdsdata,
                                              onlinedata = df_online),
                              country = c('JDS4_sampling_ID'),
                              lats = 'lat', lons = 'lon',
                              species = c('speciesname', 'scientificName'))
#Cleaning data

cleannames_df <- check_names(data = mergealldfs, colsp = 'species', pct = 90,
                             merge = TRUE, verbose = FALSE)

spfilter <- cleannames_df[cleannames_df$speciescheck %in%
                                   c("Squalius cephalus", 'Salmo trutta',
                                     "Thymallus thymallus","Anguilla anguilla", 
                                     'Barbatula barbatula'),]

worldclim <- terra::rast(system.file('extdata/worldclim.tiff', package = 'specleanr'))

#Get basin shapefile to delineate the study region: optional

danube <- sf::st_read(system.file('extdata', 'danube.shp.zip',
                                  package = 'specleanr'), quiet=TRUE)

## ----outlierdetoptplot, fig.width = 6, fig.height= 4.5, fig.align='center', dpi=120----
parm <- par(mfrow = c(2, 2),
    mar = c(3,3, 1.5, 0.5),
    oma = c(0, 0, 0, 0),
    mgp = c(1.7, 0.8, 0)
)

spp <- unique(spfilter$speciescheck)

pltout <- lapply(spp, function(s){
  
  spout <- spfilter[spfilter[,'speciescheck'] %in%s,]
  
  refdata <-  pred_extract(data= spout, raster= worldclim,
                           lat = 'decimalLatitude',
                           lon = 'decimalLongitude',
                           colsp = 'speciescheck',
                           bbox  = danube,
                           list= TRUE,
                           minpts = 10)
  
  outdet <- multidetect(data = refdata, multiple = FALSE,
                        var = 'bio6', output = 'outlier',
                        exclude = c('x','y'),
                        methods = c('zscore', 'adjbox',
                                    'logboxplot', 'distboxplot',
                                    'iqr', 'semiqr','seqfences',
                                    'hampel','kmeans',
                                    'jknife', 'onesvm',
                                    'iforest'), 
                        warn = FALSE)
  print(nrow(refdata))
  
  opt <- optimal_threshold(refdata = refdata, outliers = outdet, 
                             plotsetting = list(plot = TRUE, group = s))
  opt
  
})
par(parm)

## ----simulated data-----------------------------------------------------------
set.seed(113554333)
a <- rnorm(30, 32, 1)
b <- rnorm(30, 4, 1)
c <- rnorm(30, 0, 1)
d <- rnorm(30, 6, 1)
#add outlier rows
out <- c(409, 43, 76, 23)
out1 <- c(-0.2409, 10, 43, 22)
out2 <- c(1509, 0.43, 76, 23)

df <- data.frame(a, b, c, d)

df2 <- rbind(df, out, out1, out2)


## ----outlier detection--------------------------------------------------------
outdet2 <- multidetect(data = df2, multiple = FALSE,
                      var = 'a', output = 'outlier',
                      methods = c('zscore', 'adjbox',
                                  'logboxplot', 'distboxplot',
                                  'iqr', 'semiqr','seqfences',
                                  'hampel','kmeans',
                                  'jknife', 'onesvm',
                                  'iforest'), 
                      warn = FALSE)

## ----visualise data, fig.width = 6, fig.height= 4, fig.align='center'---------
par(mar = c(3, 3, 1.5, 1.5))

opt1 <- optimal_threshold(refdata = df2, 
                          outliers = outdet2, 
                         plotsetting = list(plot = TRUE))
opt1



## ----check for the outlier weights and data cleaning--------------------------
#get the weights for the flagged records

weights <- ocindex(x = outdet2, absolute = TRUE, props = TRUE, threshold = 0.1, warn = FALSE)

print(weights)

dfclean <- extract_clean_data(refdata = df2, outliers = outdet2, loess = TRUE)

print(dfclean)