
#' Trawl a local CRAN archive and extract statistics from all packages
#'
#' @param path Path to local CRAN archive. This must either be, or contain, a
#' directory named `tarballs`.
#' @param archive If `TRUE`, extract statistics for all packages in the
#' `/Archive` sub-directory, otherwise only statistics for main `tarballs`
#' directory (that is, current packages only).
#' @param prev_results Result of previous call to this function, if available.
#' Submitting previous results will ensure that only newer packages not present
#' in previous result will be analysed, with new results simply appended to
#' previous results. This parameter can also specify a file to be read with
#' `readRDS()`.
#' @param results_file Can be used to specify the name or full path of a `.Rds`
#' file to which results should be saved once they have been generated. The
#' '.Rds' extension will be automatically appended, and any other extensions
#' will be ignored. The directory of this file must already exist; this is
#' checked before the trawl starts.
#' @param chunk_size Divide large archive trawl into chunks of this size, and
#' save intermediate results to local files. These intermediate files can be
#' combined to generate a single `prev_results` file, to enable jobs to be
#' stopped and re-started without having to recalculate all results. These files
#' will be named `pkgstats-results-N.Rds`, where "N" incrementally numbers each
#' file. They are removed once the trawl has successfully finished.
#' @param save_full If `TRUE`, full \link{pkgstats} results are saved for each
#' package to files in `results_path`.
#' @param results_path Path to save intermediate files generated by the
#' `chunk_size` parameter described above, along with the full results saved
#' when `save_full = TRUE`.
#'
#' @note Each analysis in an archive trawl spawns several \emph{unsupervised}
#' processes, preventing the trawl from running in parallel. Accurate results
#' can only be guaranteed by running this function as a single process.
#'
#' @return A `data.frame` object with one row for each package containing
#' summary statistics generated from the \link{pkgstats_summary} function. The
#' value is returned invisibly.
#'
#' @family archive
#' @export
pkgstats_from_archive <- function (path,
                                   archive = TRUE,
                                   prev_results = NULL,
                                   results_file = NULL,
                                   chunk_size = 1000L,
                                   save_full = FALSE,
                                   results_path = tempdir ()) {

    # `requireNamespace()` only returns `FALSE` when a package is missing, so
    # the result has to be checked explicitly.
    for (dep in c ("hms", "pbapply")) {
        if (!requireNamespace (dep, quietly = TRUE)) {
            stop ("Package '", dep, "' must be installed to use this function")
        }
    }

    if (!grepl ("tarball", path)) {
        if (!dir.exists (file.path (path, "tarballs"))) {
            stop ("path must contain a 'tarballs' directory")
        }
        path <- file.path (path, "tarballs")
    }

    path_last <- utils::tail (decompose_path (path) [[1]], 1L)
    if (path_last != "tarballs") {
        stop ("path must be a directory named 'tarballs'")
    }

    if (!dir.exists (path)) {
        stop ("[", path, "] directory does not exist")
    }

    # Resolve the final output file before any work is done, so that an
    # invalid location fails immediately rather than after the whole trawl.
    if (!is.null (results_file)) {
        results_dir <- normalizePath (
            dirname (results_file),
            mustWork = FALSE
        )
        if (!dir.exists (results_dir)) {
            stop ("Directory [", results_dir, "] does not exist")
        }
        results_name <- tools::file_path_sans_ext (basename (results_file))
        results_file <- file.path (
            results_dir,
            paste0 (results_name, ".Rds")
        )
    }

    # Previous results given as a file name must be read here, because they
    # are both used to filter the file list and bound to the new results.
    if (is.character (prev_results)) {
        if (length (prev_results) != 1L) {
            stop ("prev_results must be a single-length character")
        }
        if (!file.exists (prev_results)) {
            stop ("file [", prev_results, "] does not exist")
        }
        prev_results <- readRDS (prev_results)
    }

    res <- NULL
    results_files <- character (0)

    flist <- list.files (
        path,
        recursive = archive,
        full.names = TRUE,
        pattern = "\\.tar\\.gz$"
    )
    flist <- normalizePath (flist)
    flist <- rm_prev_files (flist, prev_results)
    nfiles <- length (flist)

    if (nfiles > 0) {

        if (!dir.exists (results_path)) {
            stop ("Directory [", results_path, "] does not exist")
        }
        results_path <- normalizePath (results_path)

        # Consecutive groups of (at most) `chunk_size` files
        flist <- split (flist, f = ceiling (seq_len (nfiles) / chunk_size))

        message (
            "Starting trawl of ", nfiles,
            " files in ", length (flist), " chunks"
        )

        index <- 1 # name of temporary files
        pt0 <- proc.time ()

        for (f in flist) {

            res <- pbapply::pblapply (f, function (i) {

                # `basename()` copes with whichever separator
                # `normalizePath()` produced on the current platform.
                tarball <- basename (i)

                s <- tryCatch (pkgstats::pkgstats (i),
                    error = function (e) NULL
                )

                if (save_full) {
                    pkg <- gsub ("\\.tar\\.gz$", "", tarball)
                    saveRDS (s, file.path (results_path, pkg))
                }

                summ <- tryCatch (pkgstats::pkgstats_summary (s),
                    error = function (e) NULL
                )
                # `s` is NULL when `pkgstats()` itself failed; `summ` is NULL
                # when the summary step errored. In either case fall back to
                # an empty summary labelled from the tarball name, which has
                # the form `<package>_<version>.tar.gz`.
                if (is.null (s) || is.null (summ)) {
                    summ <- pkgstats_summary () # null summary
                    p <- strsplit (tarball, "\\_") [[1]]
                    summ ["package"] <- p [1]
                    summ ["version"] <-
                        gsub ("\\.tar\\.gz$", "", p [2])
                }
                summ
            })

            fname <- file.path (
                results_path,
                paste0 ("pkgstats-results-", index, ".Rds")
            )
            saveRDS (do.call (rbind, res), fname)
            results_files <- c (results_files, fname)

            # The final chunk is generally smaller than `chunk_size`, so
            # progress is measured from the number of files actually done.
            ndone <- min (nfiles, index * chunk_size)
            prog_fmt <- format (100 * ndone / nfiles, digits = 2)
            pt1 <- as.integer ((proc.time () - pt0) [3])
            t_per_file <- pt1 / ndone
            t_rem <- hms::hms (round (t_per_file * (nfiles - ndone)))

            message (
                "[", ndone, " / ", nfiles,
                "]  = ", prog_fmt, "%; (elapsed, remaining) = (",
                pt1, ", ", t_rem, ")"
            )

            index <- index + 1
        }

        res <- do.call (rbind, lapply (results_files, readRDS))
    }

    out <- rbind (prev_results, res)
    if (!is.null (out)) {
        rownames (out) <- NULL
    }

    if (!is.null (res) && !is.null (results_file)) {
        saveRDS (out, results_file)
    }

    # Intermediate files are only removed once everything above has
    # succeeded, so that an interrupted or failed trawl can be resumed.
    if (length (results_files) > 0L) {
        chk <- file.remove (results_files) # nolint
    }

    invisible (out)
}

#' Remove files for which results have already been generated
#'
#' Tarballs are matched on file name only, by comparing the base name of each
#' entry of `flist` against `<package>_<version>.tar.gz` constructed from the
#' `package` and `version` columns of `prev_results`.
#'
#' @param flist Full paths to all tarball files to be analysed
#' @param prev_results `data.frame` of previous results, or a single path to a
#' file containing such a `data.frame` which can be read with `readRDS()`. If
#' `NULL`, `flist` is returned unchanged.
#' @return Modified version of `flist`, after removing any entries present in
#' `prev_results`.
#' @noRd
rm_prev_files <- function (flist, prev_results) {

    if (is.null (prev_results)) {
        return (flist)
    }

    if (is.character (prev_results)) {
        if (length (prev_results) != 1L) {
            stop ("prev_results must be a single-length character")
        }
        if (!file.exists (prev_results)) {
            stop ("file [", prev_results, "] does not exist")
        }
        prev_results <- tryCatch (readRDS (prev_results),
            error = function (e) e
        )
        if (inherits (prev_results, "error")) {
            stop (
                "Unable to read prev_results: ",
                conditionMessage (prev_results)
            )
        }
    }

    # `basename()` is vectorised and handles the path separators of the
    # current platform, unlike splitting on `.Platform$file.sep`, which is
    # always "/" even where `normalizePath()` returns backslashes.
    tars <- basename (flist)

    prev_tars <- paste0 (
        prev_results$package,
        "_",
        prev_results$version,
        ".tar.gz"
    )

    flist [!tars %in% prev_tars]
}
