#!/usr/bin/env python
import copy
import logging
import math
import os.path
import sqlite3
from collections import deque
from dataclasses import dataclass, field
from typing import Optional, List
import pandas as pd

from nsys_cpu_stats.trace_processor import TraceProcessor, TraceLoaderType
from nsys_cpu_stats.trace_loader import TraceLoaderRegions, TraceLoaderSupport, TraceLoaderGPUMetrics
import nsys_cpu_stats.trace_utils as tu

logger = logging.getLogger(__name__)


def sort_on_start(e):
    """Sort key: return the ``start`` attribute of ``e``."""
    start = e.start
    return start


def sort_on_end(e):
    """Sort key: return the ``end`` attribute of ``e``."""
    end = e.end
    return end


@dataclass
class CPUThreadingHealth:
    """One row of the CPU threading health check.

    ``black_flag`` is derived, not passed in: it is True only when the entry
    is flagged *and* ``health`` lies inside the inclusive range
    ``[black_flag_min, black_flag_max]``.
    """
    # Short name of the metric (used as the row description).
    health_string: str = ""
    # Advice reported when the entry is flagged.
    issue_string: str = ""
    # The measured value of the metric.
    health: float = 0
    # Unit label for ``health`` (e.g. "%", "bool", or "" for plain counts).
    unit: str = ""
    # True when the metric is considered a warning.
    flag: bool = False
    # Inclusive bounds on ``health`` that escalate a flagged entry.
    black_flag_min: float = 0
    black_flag_max: float = 0

    # Not needed in init, computed in post_init. The default guarantees the
    # attribute always exists, so repr()/== and direct reads never raise
    # AttributeError on unflagged entries.
    black_flag: bool = field(init=False, default=False)

    def __post_init__(self):
        """Derive ``black_flag`` from ``flag`` and the black-flag range."""
        self.black_flag = bool(
            self.flag and self.black_flag_min <= self.health <= self.black_flag_max
        )


@dataclass
class CPUInfo:
    """Core counts of the traced CPU.

    The counts are declared as real dataclass fields so the generated
    ``__repr__`` and ``__eq__`` actually take them into account. All fields
    default to 0, so ``CPUInfo()`` keeps working as before.
    """
    # Total core count as reported by the trace processor.
    core_count: int = 0
    # Physical / logical (SMT) performance-core counts.
    p_core_count: int = 0
    p_smt_core_count: int = 0
    # Physical / logical (SMT) efficiency-core counts.
    e_core_count: int = 0
    e_smt_core_count: int = 0

    def get_dict(self) -> dict:
        """Return the live attribute dictionary (not a copy), in field order."""
        return self.__dict__


class CPUThreadingStatistics:
    def __init__(self):
        self.process_name = ""
        self.start_time = 0
        self.end_time = 0

        self.total_thread_count = 0
        self.active_thread_count = 0
        self.job_thread_count = 0
        self.serial_thread_count = 0

        self.total_utilisation = 0
        self.busiest_thread_util = 0
        self.median_job_thread_util = 0
        self.serial_work_util = 0
        self.concurrency_of_active_threads = 0

        self.idle_time = 0
        self.one_thread_active = 0
        self.two_threads_active = 0

        self.average_cpu_frametime_ms = 0
        self.average_gpu_frametime_ms = 0

        self.gpu_utilisation = 0

    def is_cpu_bound(self):
        cpu_bound = (self.idle_time < 0.05) or (0 < self.gpu_utilisation < .95)
        if self.average_cpu_frametime_ms and self.average_gpu_frametime_ms:
            cpu_bound = self.average_cpu_frametime_ms >= (self.average_gpu_frametime_ms + 0.5)  # add 0.5ms for present time
        return cpu_bound

    def print_stats(self):
        logger.info(f"""
            Average CPU Threading Statistics ({self.process_name})
            {"=" * 50}
            {self.start_time / 1000000000.0:<6.2f} ms : {"Start Time":<50}
            {self.end_time / 1000000000.0:<6.2f} ms : {"End Time":<50}
            {self.average_cpu_frametime_ms:<6.2f} ms : {"Average CPU Frametime":<50}
            {self.average_gpu_frametime_ms:<6.2f} ms : {"Average GPU Frametime":<50}
            {self.total_thread_count:<6}    : {"Total Thread Count":<50}
            {self.active_thread_count:<6}    : {"Active Thread Count":<50}
            {self.job_thread_count:<6}    : {"Worker Thread Count":<50}
            {self.serial_thread_count:<6}    : {"Main Thread Count (render + game...)":<50}
            {self.total_utilisation * 100:<6.2f} %  : {"CPU Core Utilisation":<50}
            {self.busiest_thread_util * 100:<6.2f} %  : {"Busiest Thread Utilisation":<50}
            {self.median_job_thread_util * 100:<6.2f} %  : {"Median Job Thread Utilisation":<50}
            {self.serial_work_util * 100:<6.2f} %  : {"Serial Work (Job threads not running)":<50}
            {self.idle_time * 100:<6.2f} %  : {"All Threads Idle":<50}
            {self.one_thread_active * 100:<6.2f} %  : {"Only 1 Thread Active":<50}
            {self.two_threads_active * 100:<6.2f} %  : {"Only 2 Threads Active":<50}
            {self.concurrency_of_active_threads * 100:<6.2f} %  : {"All Active Threads Running Concurrently":<50}
            {self.gpu_utilisation * 100:<6.2f} %  : {"Average GPU Utilisation":<50}
            {"=" * 50}'
            """)

    def get_stats_as_dict(self) -> dict:
        return self.__dict__

    def sanitise_stats_dict(self) -> dict:
        d = copy.deepcopy(self.get_stats_as_dict())
        d["start_time_(s)"] = d.pop("start_time") / 1000000000
        d["end_time_(s)"] = d.pop("end_time") / 1000000000

        d["average_cpu_frametime_(ms)"] = d.pop("average_cpu_frametime_ms")
        d["average_gpu_frametime_(ms)"] = d.pop("average_gpu_frametime_ms")

        d["cpu_idle_(%)"] = d.pop("idle_time") * 100
        d["gpu_idle_(%)"] = (1 - d.pop("gpu_utilisation")) * 100

        d["total_thread_utilisation_(%)"] = d.pop("total_utilisation") * 100
        d["busiest_thread_util_(%)"] = d.pop("busiest_thread_util") * 100
        d["median_job_thread_util_(%)"] = d.pop("median_job_thread_util") * 100
        d["serial_work_util_(%)"] = d.pop("serial_work_util") * 100
        d["one_thread_active_(%)"] = d.pop("one_thread_active") * 100
        d["two_threads_active_(%)"] = d.pop("two_threads_active") * 100

        return d

    def run_cpu_threading_health_check(self, cpu_info: CPUInfo) -> List:
        # Flags
        threading_health_list = []

        total_core_count = cpu_info.core_count

        # Until we have better support, always assume SMT is enabled
        if cpu_info.p_core_count:
            physical_core_count = cpu_info.p_core_count + cpu_info.e_core_count
        else:
            physical_core_count = total_core_count / 2

        # CPU Bound
        cpu_bound = self.is_cpu_bound()  # self.average_cpu_frametime_ms > (self.average_gpu_frametime_ms * 1.02)

        threading_health_list.append(CPUThreadingHealth("CPU Bound", "Reduce work from the critical thread.", int(self.is_cpu_bound()), "bool", self.is_cpu_bound(), 0, 1))

        # Report on GPU idle time. Anything > 20% is a black flag
        gpu_idle_time = (1 - self.gpu_utilisation) * 100
        if self.gpu_utilisation == 0:
            threading_health_list.append(CPUThreadingHealth("Average GPU Idle Time per frame", "Collection Error! No frametime data found.", 0, "%", True, 0, 100))
        else:
            threading_health_list.append(CPUThreadingHealth("Average GPU Idle Time per frame", "CPU Bound", gpu_idle_time, "%", gpu_idle_time > 5 and cpu_bound, 20, 100))

        # Report on CPU idle - when no CPU threads are running. Black flag if > 5%
        threading_health_list.append(CPUThreadingHealth("Average Application CPU Idle Time per frame", "CPU idle/underutilized - check synchronisation primitives (GPU/CPU)!", self.idle_time * 100, "%", (self.idle_time * 100) > 2 and cpu_bound, 5, 100))

        # Report on CPU idle - when no CPU threads are running. Black flag if > 10%
        idle_time = (1 - self.busiest_thread_util) * 100
        threading_health_list.append(CPUThreadingHealth("Idle Primary Thread", "Main thread stalling - check synchronisation primitives (CPU/CPU and GPU/CPU)!", idle_time, "%", idle_time > 5 and cpu_bound, 10, 100))

        # If the worker threads are idle and the game is CPU bound, then more work should be moved to the job system.  Black flag if >60% idle
        idle_worker_threads = (1 - self.median_job_thread_util) * 100
        threading_health_list.append(CPUThreadingHealth("Idle Job System", "Large amount of the frame with idle worker threads! Try moving more work to the job system.", idle_worker_threads, "%", idle_worker_threads > 40 and cpu_bound, 60, 100))

        # The difference between the busiest thread and the median job thread suggests the amount of time the process is serial. Black flag if > 50% and cpu bound
        threading_health_list.append(CPUThreadingHealth("Serial work", "A large amount of the frame is serial. This should be redistributed to the job system where possible.", self.serial_work_util * 100, "%", self.serial_work_util > 0.3 and cpu_bound, 50, 100))

        # Active thread count
        if total_core_count > 0:
            threading_health_list.append(CPUThreadingHealth("Active Thread Count", "The workload uses more active threads (>1% util) than total logical CPU cores. Over subscription may negatively impact performance. Experiment reducing the thread count.", self.active_thread_count, "", self.active_thread_count > total_core_count and cpu_bound, total_core_count, 1000))
            threading_health_list.append(CPUThreadingHealth("Job Thread Count", "The workload uses more job threads (>1% util) than physical cores which may cause a job thread to share a core with a critical thread. Hyper-threading may negatively impact performance. Experiment reducing the thread count.", self.job_thread_count, "", self.job_thread_count > physical_core_count and cpu_bound, physical_core_count, 1000))
        else:
            threading_health_list.append(CPUThreadingHealth("Active Thread Count", "The workload uses a lot of active threads (>1% util). Over subscription may negatively impact performance. Experiment reducing the thread count.", self.active_thread_count, "", self.active_thread_count > 16 and cpu_bound, 16, 1000))
            threading_health_list.append(CPUThreadingHealth("Job Thread Count", "The workload uses a lot of job threads (>1% util) which may over-subscribe the CPU causing a job thread to share a core with a critical thread. Hyper-threading may negatively impact performance. Experiment reducing the thread count.", self.job_thread_count, "", self.job_thread_count > 8 and cpu_bound, 8, 1000))

        # Time when all active threads are working - ignore concurrency counts < active thread count
        threading_health_list.append(CPUThreadingHealth("Active Thread Concurrency", "Small amount of the frame when all active threads (>1% util) are working concurrently! Check for synchronisation stalls.", self.concurrency_of_active_threads * 100, "%", self.concurrency_of_active_threads < 0.5 and cpu_bound, 0, 0.2))

        active_util = (self.idle_time + self.one_thread_active + self.two_threads_active) * 100
        threading_health_list.append(CPUThreadingHealth("<=2 Concurrent Threads", "Large amount of time when only 2 or less threads are running concurrently! Check for synchronisation stalls.", active_util, "%", active_util > 30 and cpu_bound, 50, 100))

        # Total utilisation is the sum of all thread utilisations so depicts the load. If it is low but still CPU bound, then there could be an issue with threading. Aim for 50% of physical cores
        threading_health_list.append(CPUThreadingHealth("Total Thread Utilisation", "Low total utilisation and cpu bound suggests in-effective threading!", self.total_utilisation * 100, "%",
                                     ((physical_core_count > 0 and self.total_utilisation < physical_core_count / 2) or (self.total_utilisation < 5)) and cpu_bound, 0, 200))

        return threading_health_list

    def print_cpu_threading_health_check(self, threading_health_list):
        self.print_cpu_threading_health_check_notermcolor(threading_health_list)

    def print_cpu_threading_health_check_notermcolor(self, threading_health_list):
        cpu_bound = self.is_cpu_bound()  # self.average_cpu_frametime_ms > (self.average_gpu_frametime_ms * 1.02)
        cpu_bound_str = "CPU Bound"
        if not cpu_bound:
            cpu_bound_str = "GPU Bound"

        logger.info(f"""
            CPU Threading Health Check ({self.process_name}) : {cpu_bound_str}
            {"=" * 50}
            {"Health Value":<16} {"Flag":<10} {"Description":50}   {"Warning":}
            {"-" * 120}""")

        for th in threading_health_list:
            col = "green"
            issue_string = ""
            if th.flag:
                col = "yellow"
                if th.black_flag:
                    col = 'red'
                issue_string = th.issue_string
            logger.info(f'{th.health:>6.2f} {th.unit:8} [{col:6}] : {th.health_string:50} - {issue_string:}')

        logger.info(f'{"=" * 50}\n')

    @staticmethod
    def get_health_check_as_dict(threading_health_list) -> dict:
        d = {"Description": [], "Health Metric": [], "Flag": [], "Warning": []}

        for th in threading_health_list:
            col = "green"
            issue_string = ""
            if th.flag:
                col = "yellow"
                if th.black_flag:
                    col = 'red'
                issue_string = th.issue_string
            if th.unit:
                d["Description"].append(f'{th.health_string} ({th.unit})')
            else:
                d["Description"].append(th.health_string)
            if th.unit == "bool":
                if th.health > 0:
                    d["Health Metric"].append("True")
                else:
                    d["Health Metric"].append("False")
            else:
                d["Health Metric"].append(th.health)
            d["Flag"].append(col)
            d["Warning"].append(issue_string)
        return d


class CPUThreadingInfo:
    """Generates statistics for the CPU"""

    def __init__(self, quiet=False):
        """Create the trace processor and the (initially empty) result holders.

        Args:
            quiet: Stored on the instance; ``process_sqldb`` uses it when
                calling into the trace processor.
        """
        self.tp = TraceProcessor()
        # The attributes below are (re)populated by process_sqldb()
        self.stats: Optional[CPUThreadingStatistics] = None
        self.cpu_info: Optional[CPUInfo] = None
        # Fraction of time per number of concurrently running threads (index = thread count)
        self.concurrent_thread_list: List[float] = []
        self.threading_health_list: List[CPUThreadingHealth] = []
        # Thread name -> utilisation for the target process
        self.named_thread_util_dict: Optional[dict] = None
        # Second value returned by TraceProcessor.find_target_pid()
        self.process_util_dict: Optional[dict] = None
        self.quiet = quiet

    def init_nsys_database(self, file_in):
        self.tp.init_loader(TraceLoaderType.NSysRep_Loader).init_database(file_in)
        self.tp.set_current_loader(TraceLoaderType.NSysRep_Loader)
        self.tp.init_common()

    def close_nsys_database(self):
        if self.tp.current_loader == TraceLoaderType.NSysRep_Loader:
            self.tp.get_loader(TraceLoaderType.NSysRep_Loader).close_database()

    def __get_thread_counts(self, thread_utilisation_dict, sorted_thread_keys, target_pid):
        active_thread_count = 0
        total_thread_count = 0
        total_utilisation = 0

        active_thread_threshold = 0.01  # 1%

        # Walk the sorted threads inserting the info
        for key in sorted_thread_keys:
            pid, tid = tu.convert_global_tid(key)
            if pid != target_pid:
                continue

            util = thread_utilisation_dict[key]

            if util > active_thread_threshold:
                active_thread_count += 1
            total_thread_count += 1
            total_utilisation += util

        return total_thread_count, active_thread_count, total_utilisation

    ####################################################
    #
    # Calculate the concurrency from the timeslice lists
    #
    ####################################################
    def __get_thread_concurrency(self, timeslice_list, total_time, target_pid):
        """Accumulate how much time was spent with N slices running concurrently.

        Sweeps the timeslices in start order while keeping a deque of the
        slices that have not yet ended. At every slice start/end boundary the
        elapsed time since the previous boundary is added to the bucket indexed
        by the number of slices currently in the deque, and subtracted from
        bucket 0, which starts at ``total_time``.

        Side effect: ``timeslice_list`` is sorted in place by ``start``.

        Args:
            timeslice_list: Objects with ``start``, ``end`` and ``gtid`` attributes.
            total_time: Initial value of bucket 0 (time with nothing running).
            target_pid: Only slices whose ``gtid`` decodes to this pid are used.

        Returns:
            A list where index N holds the accumulated time with N concurrent
            slices; trailing buckets below 0.01 are removed.
        """
        # Slices that have started and not yet been retired
        d = deque()

        # walk through the timeslices. They are sorted on t.start
        # TODO: do we need this?
        # last_timeslice = len(timeslice_list) - 1
        concurrent_thread_time = [0] * (self.stats.total_thread_count + 2)  # add a little buffer
        concurrent_thread_time[0] = total_time

        timeslice_list.sort(key=sort_on_start)

        # verify it is sorted, otherwise we get negative times
        for ii, t in enumerate(timeslice_list):
            if ii == 0:
                continue
            assert t.start >= timeslice_list[ii - 1].start

        # Timestamp of the previous boundary that was accounted for
        prev = -1
        # The current timeslice we are looking at is (curr.start - prev), unless curr-1 has ended.
        # We essentially keep a Q of active timeslices/threads and at each timeslice intersection
        # we accumulate the duration into the appropriate thread count bucket
        for t in timeslice_list:

            # Only look at slices from our pid
            pid, tid = tu.convert_global_tid(t.gtid)
            if pid != target_pid:
                continue

            # remove a timeslice if it is no longer relevant
            # (collected first, because the deque cannot be modified while iterating it)
            gc = []
            for e in d:
                if e.end < t.start:
                    gc.append(e)

            # sort on ascending end points
            gc.sort(key=sort_on_end)
            for g in gc:
                # Bucket by the number of slices active up to g.end (g itself included)
                count = len(d)
                time = g.end - prev
                # stats.total_thread_count is for threads with > 0.1% activity, so there can be mismatches
                # Grow the bucket list when more slices overlap than it was sized for
                if count >= len(concurrent_thread_time):
                    for delta in range(len(concurrent_thread_time), count + 1):
                        concurrent_thread_time.append(0.0)
#                if count > self.stats.total_thread_count:
#                    logger.warning(f"Concurrency check has found too many threads - found: {count}, expecting a maximum of {self.stats.total_thread_count}")
                concurrent_thread_time[count] += time
                concurrent_thread_time[0] -= time
                prev = g.end
                d.remove(g)

            # skip if this is the only thread
            count = len(d)
            if count != 0:
                time = t.start - prev
                # stats.total_thread_count is for threads with > 0.1% activity, so there can be mismatches
                if count >= len(concurrent_thread_time):
                    for delta in range(len(concurrent_thread_time), count + 1):
                        concurrent_thread_time.append(0.0)
                concurrent_thread_time[count] += time
                concurrent_thread_time[0] -= time
                prev = t.start
            else:
                # Nothing was running before this slice: the gap stays in bucket 0
                prev = t.start

            # put the current timeslice on the deque
            d.append(t)

        # NOTE(review): slices still in the deque when the loop ends are never flushed,
        # so their remaining time stays in bucket 0 - confirm this is intended.

        # Remove the ~0% concurrency numbers (to help reporting with apps that have 100's of idle threads)
        rev_concurrent_thread_time = reversed(concurrent_thread_time)
        delete_count = 0
        for ii, t in enumerate(rev_concurrent_thread_time):
            if t < 0.01:
                delete_count += 1
            else:
                break

        # This is an alias, not a copy: the trimming below shortens concurrent_thread_time itself.
        # NOTE(review): if every bucket is below 0.01 the list ends up empty, including bucket 0.
        display_concurrent_thread_time = concurrent_thread_time
        while delete_count:
            lenght = len(display_concurrent_thread_time)
            del display_concurrent_thread_time[lenght - 1]
            delete_count -= 1

        return display_concurrent_thread_time

    @staticmethod
    def __get_amdahls_law(thread_utilisation_dict, sorted_thread_keys, median_job_thread_util):
        # Amdahls Law
        # S = 1 / (1 - p)
        # p is the parallel part of the workload [0,1]
        # We are looking for the threads in the tasking system as the pll part of the workload
        # So, pick a mid thread and get it's utilisation.
        # If a thread is within 10% util of it, then add it to the pll sum
        # The serial part of the workload are the remaining active threads not in the job system,
        # so typically the game/render threads
        serial_limit = 1.2
        parallel_limit = 1.2
        serial_util = 0
        parallel_util = 0
        serial_thread_count = 0
        job_thread_count = 0

        for key in sorted_thread_keys:
            util = thread_utilisation_dict[key]

            if util > (median_job_thread_util * serial_limit):
                # This is the serial part of the workload
                # although there may actual be more than 1 threads working in parallel here
                serial_util = max(serial_util, util)
                serial_thread_count += 1
            #        serial_util += util
            # Only care about the job threads
            elif util > (median_job_thread_util / parallel_limit):
                parallel_util += util
                job_thread_count += 1
        total_amdahl = serial_util + parallel_util
        serial_util = serial_util / total_amdahl
        parallel_util = parallel_util / total_amdahl
        return total_amdahl, serial_util, parallel_util, serial_thread_count, job_thread_count

    # TODO: cpuframeinfo code duplication
    def __create_named_thread_utilisation_dict(self, thread_utilisation_dict, sorted_thread_keys, target_pid):
        named_thread_util_dict = {}
        # Walk the sorted threads inserting the info
        for key in sorted_thread_keys:
            pid, tid = tu.convert_global_tid(key)
            if pid != target_pid:
                continue

            util = thread_utilisation_dict[key]

            name = self.tp.get_thread_name(key)
            named_thread_util_dict[name] = util

        return named_thread_util_dict

    @staticmethod
    def __create_concurrent_thread_list(display_concurrent_thread_time, total_time):
        concurrent_thread_util = []

        for ii, t in enumerate(display_concurrent_thread_time):
            concurrent_thread_util.append(t / total_time)

        return concurrent_thread_util

    def process_sqldb(self,
                      target_pid: Optional[int] = None,
                      list_pids: Optional[bool] = False,
                      start_time_ns: Optional[float] = None,
                      end_time_ns: Optional[float] = None,
                      cpu_config: Optional[tu.CPUConfig] = None):
        """Analyse the loaded trace and populate statistics, health check and dataframes.

        Resets ``self.stats`` / ``self.cpu_info``, computes thread utilisation
        and concurrency for the target process, derives frame times and GPU
        utilisation, runs the health check and registers the result tables
        with the trace processor.

        Args:
            target_pid: Process to analyse. When falsy, the pid returned by
                ``find_target_pid`` is used.
            list_pids: When true, return ``False`` right after
                ``find_target_pid`` has run, without analysing anything.
            start_time_ns: Start of the analysis window.
            end_time_ns: End of the analysis window. When *both* bounds are
                ``None`` the window is the whole trace minus 5% at each end.
                NOTE(review): passing only one bound leaves the other ``None``
                and the duration subtraction below would then fail - confirm
                callers always pass both or neither.
            cpu_config: Fallback P/E core layout; replaced by the trace's own
                configuration when ``get_cpu_config()`` returns one.

        Returns:
            ``True`` when the analysis completed, ``False`` when ``list_pids``
            was requested.
        """

        self.tp.clear_dataframes()

        self.stats = CPUThreadingStatistics()
        self.cpu_info = CPUInfo()

        self.cpu_info.core_count = self.tp.get_core_count()

        # If the CPU config is available from nsys, use it.
        config = self.tp.get_cpu_config()
        if config:
            cpu_config = config

        if cpu_config:
            if cpu_config.physical_p_core_count:
                self.cpu_info.p_core_count = cpu_config.physical_p_core_count
            if cpu_config.logical_p_core_count:
                self.cpu_info.p_smt_core_count = cpu_config.logical_p_core_count
            if cpu_config.physical_e_core_count:
                self.cpu_info.e_core_count = cpu_config.physical_e_core_count
            if cpu_config.logical_e_core_count:
                self.cpu_info.e_smt_core_count = cpu_config.logical_e_core_count

            # NOTE(review): this reports the *logical* P core count next to the *physical* E core count
            logger.info(f"Detected {cpu_config.logical_p_core_count} P cores and {cpu_config.physical_e_core_count} E cores.")

        duration = 0

        all_timeslice_list = self.tp.get_all_timeslices(quiet=self.quiet)

        if start_time_ns is None and end_time_ns is None:
            # Get the time slices
            # NOTE(review): an empty timeslice list raises IndexError here
            start = max(all_timeslice_list[0].start, 0)
            duration = all_timeslice_list[-1].end - start

            # Ignore the first and last 5% of the trace
            ignore_delta = duration / 20
            start_time_ns = start + ignore_delta
            end_time_ns = start + duration - ignore_delta

        timeslice_list = self.tp.filter_timeslices(all_timeslice_list, start_time_ns, end_time_ns)

        duration = end_time_ns - start_time_ns
        self.stats.start_time = start_time_ns
        self.stats.end_time = end_time_ns

        # The duration logged here is the length of the analysis window
        logger.info(f'Processing Time Start/End : {start_time_ns / 1000000000:<0.1f}s --> {end_time_ns / 1000000000:<0.1f}s (total trace duration : {duration / 1000000000:<0.1f}s)')

        ###############################################
        #
        # Thread Utilisation
        #
        ###############################################

        # Process the timeslices to generate thread utilisation
        total_time, thread_utilisation_dict, cpu_thread_utilisation_dict = self.tp.get_thread_utilisation_per_core(timeslice_list, True)

        # Find the main PID, then remove the rest of the threads from the dictionary
        sorted_thread_keys = sorted(thread_utilisation_dict, key=thread_utilisation_dict.get, reverse=True)

#        if main_pid is None or list_pids:
        main_pid, self.process_util_dict = self.tp.find_target_pid(thread_utilisation_dict, start_time_ns, end_time_ns, not self.quiet)
        # An explicitly requested pid overrides the detected one
        if target_pid and main_pid != target_pid:
            logger.info("Target PID is not the busiest PID.")
            main_pid = target_pid

        if list_pids:
            return False

        logger.info(f"Target Process: {main_pid} : {self.tp.get_process_name(main_pid)}")

        # Filter in pid
        timeslice_process_list = self.tp.filter_timeslices(timeslice_list, start_time_ns, end_time_ns, main_pid)
        timeslice_list = timeslice_process_list

        self.named_thread_util_dict = self.__create_named_thread_utilisation_dict(thread_utilisation_dict, sorted_thread_keys, main_pid)
        self.stats.process_name = self.tp.get_process_name(main_pid)

        # Drop the threads of every other process (iterates the key list, so deleting is safe)
        for key in sorted_thread_keys:
            pid, tid = tu.convert_global_tid(key)
            if pid != main_pid:
                del thread_utilisation_dict[key]

        # re-sort
        sorted_thread_keys = sorted(thread_utilisation_dict, key=thread_utilisation_dict.get, reverse=True)

        # Insert the utilisation data
        self.stats.total_thread_count, self.stats.active_thread_count, self.stats.total_utilisation = self.__get_thread_counts(thread_utilisation_dict, sorted_thread_keys, main_pid)

        # NOTE(review): raises IndexError when the target process has no threads in the window
        self.stats.busiest_thread_util = thread_utilisation_dict[sorted_thread_keys[0]]
        # The thread halfway down the active threads (sorted by descending utilisation) is the reference job thread
        median_thread = math.floor(self.stats.active_thread_count / 2)
        self.stats.median_job_thread_util = thread_utilisation_dict[sorted_thread_keys[median_thread]]
        self.stats.serial_work_util = self.stats.busiest_thread_util - self.stats.median_job_thread_util

        ###############################################
        #
        # Thread Concurrency
        #
        ###############################################
        concurrent_thread_time = self.__get_thread_concurrency(timeslice_list, total_time, main_pid)
        self.concurrent_thread_list = self.__create_concurrent_thread_list(concurrent_thread_time, total_time)
        self.stats.idle_time = self.concurrent_thread_list[0]
        self.stats.one_thread_active = self.concurrent_thread_list[1] if len(self.concurrent_thread_list) > 1 else 0
        self.stats.two_threads_active = self.concurrent_thread_list[2] if len(self.concurrent_thread_list) > 2 else 0

        # Time when all active threads are working - ignore concurrency counts < active thread count
        self.stats.concurrency_of_active_threads = 0
        for ii, t in enumerate(concurrent_thread_time):
            if ii < self.stats.active_thread_count:
                continue
            self.stats.concurrency_of_active_threads += t / total_time

        ###############################################
        #
        # CPU/GPU frametimes
        #
        ###############################################
        self.stats.average_cpu_frametime_ms, frametime_list_cpu = self.tp.get_region_durations(region_type=TraceLoaderRegions.CPU_FRAMETIMES, start_time_ns=start_time_ns, end_time_ns=end_time_ns, target_pid=main_pid)
        if not frametime_list_cpu:
            logger.error("Failed to detect any Present events - unable to generate CPU frametimes.")

        self.stats.gpu_utilisation = 0
        if self.tp.is_supported(TraceLoaderSupport.GPU_METRICS) and self.tp.is_gpu_metric_supported(TraceLoaderGPUMetrics.GPU_UTILISATION):
            self.stats.gpu_utilisation = self.tp.get_average_gpu_metrics(TraceLoaderGPUMetrics.GPU_UTILISATION, start_time_ns, end_time_ns)

        if not self.stats.gpu_utilisation or self.stats.gpu_utilisation == 0:
            logger.error("Failed to detect GPU load - unable to generate GPU frametimes.")

        # The GPU frame time is estimated as the CPU frame time scaled by the GPU utilisation
        self.stats.average_gpu_frametime_ms = self.stats.average_cpu_frametime_ms * self.stats.gpu_utilisation

        ###############################################
        #
        # Amdahls Law
        #
        ###############################################
        # Only the thread counts are stored; the utilisation split is not used further here
        total_amdahl, serial_util, parallel_util, self.stats.serial_thread_count, self.stats.job_thread_count = self.__get_amdahls_law(thread_utilisation_dict, sorted_thread_keys, self.stats.median_job_thread_util)

        ###############################################
        #
        # Health Check
        #
        ###############################################
        self.threading_health_list = self.stats.run_cpu_threading_health_check(self.cpu_info)

        ###############################################
        #
        # Generate the dataframes we want to add to pandas/excel
        #
        ###############################################
        stats_dict = self.stats.sanitise_stats_dict()
        self.tp.add_dataframe_from_dict("threading_stats", {"Statistics": stats_dict.keys(), "Values": stats_dict.values()}, None, False)

        cpu_info_dict = self.cpu_info.get_dict()
        self.tp.add_dataframe_from_dict("cpu_info", {"Statistics": cpu_info_dict.keys(), "Values": cpu_info_dict.values()}, None, False)

        self.tp.add_dataframe_from_dict("health_check", self.stats.get_health_check_as_dict(self.threading_health_list), None, False)

        self.tp.add_dataframe_from_dict(key_string="process_utilisation", d={"Processes": self.process_util_dict.keys(), "Utilisation (%)": self.process_util_dict.values()}, sort=True, sort_column="Utilisation (%)")
        # Force descending order regardless of how add_dataframe_from_dict sorted it
        self.tp.df_dict["process_utilisation"].sort_values(by=["Utilisation (%)"], ascending=False, inplace=True)

        # Transpose into 'series'
        util_dict = self.named_thread_util_dict.copy()
        for key in util_dict:
            util_dict[key] *= 100
        self.tp.add_dataframe_from_dict("thread_utilisation", {"Threads": util_dict.keys(), "Utilisation (%)": util_dict.values()}, None, False)

        if cpu_config and cpu_config.logical_p_core_count:
            # Work out P/E cores if applicable
            p_core_list = range(cpu_config.p_core_starting_index, cpu_config.p_core_starting_index + cpu_config.logical_p_core_count)
            cpu_named_thread_dict = {}

            # split into P/E cores
            # Utilisation is accumulated per (thread name, core type), so threads sharing a name are merged
            for (thread, cpu), thread_value in cpu_thread_utilisation_dict.items():
                tid_name = self.tp.get_thread_name(thread)
                cpu_type = "P Core" if cpu in p_core_list else "E Core"
                p_key = (tid_name, cpu_type)
                if p_key in cpu_named_thread_dict:
                    cpu_named_thread_dict[p_key] += thread_value * 100
                else:
                    cpu_named_thread_dict[p_key] = thread_value * 100

            # Re-order by the target process's threads (busiest first); names not found there are dropped
            sorted_cpu_named_thread_dict = {}
            for key_tid in sorted_thread_keys:
                tid_name = self.tp.get_thread_name(key_tid)
                if (tid_name, "P Core") in cpu_named_thread_dict:
                    sorted_cpu_named_thread_dict[(tid_name, "P Core")] = cpu_named_thread_dict[(tid_name, "P Core")]
                if (tid_name, "E Core") in cpu_named_thread_dict:
                    sorted_cpu_named_thread_dict[(tid_name, "E Core")] = cpu_named_thread_dict[(tid_name, "E Core")]

            # Convert into a series
            df = pd.Series(sorted_cpu_named_thread_dict).reset_index()
            df.columns = ['Threads', 'P/E Core', 'Utilisation (%)']
            self.tp.df_dict["cpu_thread_utilisation"] = df

        if self.stats.average_cpu_frametime_ms > 0:
            # Per-thread time per frame = utilisation fraction * average CPU frame time
            thread_time_dict = self.named_thread_util_dict.copy()
            thread_time_dict.update({n: self.stats.average_cpu_frametime_ms * thread_time_dict[n] for n in thread_time_dict.keys()})

            self.tp.add_dataframe_from_dict("thread_time", {"Threads": thread_time_dict.keys(), "Time (ms)": thread_time_dict.values()}, None, False)

        self.tp.add_dataframe_from_dict(
            "thread_concurrency",
            {
                "Number of Threads": range(0, len(self.concurrent_thread_list)),
                "Concurrency (%)": [i * 100 for i in self.concurrent_thread_list]
            },
            None,
            False
        )

        if self.stats.average_cpu_frametime_ms > 0:
            self.tp.add_dataframe_from_dict(
                "thread_concurrency_time",
                {
                    "Number of Threads": range(0, len(self.concurrent_thread_list)),
                    "Concurrency Time (ms)": [i * self.stats.average_cpu_frametime_ms for i in self.concurrent_thread_list]
                },
                None,
                False
            )

        return True

    def get_statistics(self):
        """return CPUThreadingStatistics"""
        return self.stats

    def get_thread_utilisation_dict(self):
        return self.named_thread_util_dict

    def print_thread_utilisation_dict(self):
        logger.info(f"""
            CPU Threading Utilisation ({self.stats.process_name})
            {"=" * 50}
            {" ".join(f"{ti * 100:>6.2f} % : {tk}" for tk, ti in self.named_thread_util_dict.items())}
            {"=" * 50}
            CPU Threading Time ({self.stats.process_name})
            {"=" * 50}
            {" ".join(f"{ti * self.stats.average_cpu_frametime_ms:>6.2f} ms : {tk}" for tk, ti in self.named_thread_util_dict.items())}
            {"=" * 50}
        """)

    def get_concurrent_thread_list(self):
        return self.concurrent_thread_list

    def print_concurrent_thread_list(self):
        logger.info(f"""
            CPU Threading Concurrency ({self.stats.process_name})
            {"=" * 50}
            {" ".join(f"{t * 100:>6.2f} % : {ti:3} threads" for ti, t in enumerate(self.concurrent_thread_list))}
            {"=" * 50}
            CPU Threading Concurrency Time ({self.stats.process_name})
            {"=" * 50}
            {" ".join(f"{t * self.stats.average_cpu_frametime_ms:>6.2f} ms : {ti:3} threads" for ti, t in enumerate(self.concurrent_thread_list))}
            {"=" * 50}'
        """)

    def get_health_check_list(self):
        return self.threading_health_list

    def print_cpu_threading_health_check(self):
        return self.stats.print_cpu_threading_health_check(self.threading_health_list)
