#!/usr/bin/env python

import nsysstats

class CUDAGPUKernelSummary(nsysstats.Report):
    """Report that summarizes CUDA GPU kernel execution times per kernel name.

    Rows of CUPTI_ACTIVITY_KIND_KERNEL are grouped by a kernel-name id and
    aggregated over (end - start). Which name column is used for the grouping
    is selected by the optional report arguments handled in setup().
    """

    # Keywords accepted as report arguments (compared against self.args).
    ARG_BASE = 'base'
    ARG_MANG = 'mangled'

    # Help text. This is an f-string: the ARG_* names are expanded here, while
    # the doubled braces leave a literal "{SCRIPT}" placeholder in the text
    # (presumably substituted later by the nsysstats framework -- confirm).
    usage = f"""{{SCRIPT}}[:{ARG_BASE}|{ARG_MANG}] -- CUDA GPU Kernel Summary

    {ARG_BASE} - Optional argument, if given, will cause summary to be over the
        base name of the kernel, rather than the templated name.

    {ARG_MANG} - Optional argument, if given, will cause summary to be over the
        raw mangled name of the kernel, rather than the templated name.

        Note: the ability to display mangled names is a recent addition to the
        QDREP file format, and requires that the profile data be captured with
        a recent version of Nsys. Re-exporting an existing QDREP file is not
        sufficient. If the raw, mangled kernel name data is not available, the
        default demangled names will be used.

    Output: All time values default to nanoseconds
        Time(%) : Percentage of "Total Time"
        Total Time : Total time used by all executions of this kernel
        Instances: Number of calls to this kernel
        Average : Average execution time of this kernel
        Minimum : Smallest execution time of this kernel
        Maximum : Largest execution time of this kernel
        StdDev : Standard deviation of the time of this kernel
        Name : Name of the kernel

    This report provides a summary of CUDA kernels and their
    execution times. Note that the "Time(%)" column is calculated
    using a summation of the "Total Time" column, and represents that
    kernel's percent of the execution time of the kernels listed,
    and not a percentage of the application wall or CPU execution time.
"""

    # SQL template; {NAME_COL_NAME} is filled in by setup().
    #   summary - one row per name id (falling back to demangledName when the
    #             selected column is NULL) with sum/count/avg/min/max/stdev of
    #             (end - start).
    #   totals  - grand total of the per-name sums, used for "Time(%)".
    # The final SELECT resolves the name id to text through StringIds and
    # sorts by the second output column (total time), largest first.
    # NOTE(review): stdev() is not a built-in SQLite aggregate; presumably it
    # is registered by nsysstats -- confirm. The ":dur_ns" suffixes in the
    # column aliases look like unit hints for the framework -- confirm.
    query_stub = """
WITH
    summary AS (
        SELECT
            coalesce({NAME_COL_NAME}, demangledName) AS nameId,
            sum(end - start) AS total,
            count(*) AS num,
            avg(end - start) AS avg,
            min(end - start) AS min,
            max(end - start) AS max,
            stdev(end - start) AS stddev
        FROM
            CUPTI_ACTIVITY_KIND_KERNEL
        GROUP BY 1
    ),
    totals AS (
        SELECT sum(total) AS total
        FROM summary
    )
SELECT
    round(summary.total * 100.0 / (SELECT total FROM totals), 1) AS "Time(%)",
    summary.total AS "Total Time:dur_ns",
    summary.num AS "Instances",
    round(summary.avg, 1) AS "Average:dur_ns",
    summary.min AS "Minimum:dur_ns",
    summary.max AS "Maximum:dur_ns",
    round(summary.stddev, 1) AS "StdDev:dur_ns",
    ids.value AS "Name"
FROM
    summary
LEFT JOIN
    StringIds AS ids
    ON ids.id == summary.nameId
ORDER BY 2 DESC
;
"""

    # Table name -> message to report when that table is missing
    # (presumably evaluated by the nsysstats base class -- confirm).
    table_checks = {
        'CUPTI_ACTIVITY_KIND_KERNEL':
            '{DBFILE} does not contain CUDA kernel data'
    }

    def setup(self):
        """Pick the kernel-name column and build the final SQL query.

        Runs the base-class setup first and propagates its result if it is
        not None. Otherwise the arguments in self.args are applied in order:
        'base' selects shortName; 'mangled' selects mangledName, but only
        when that column exists in CUPTI_ACTIVITY_KIND_KERNEL (if it does
        not, the current selection is left unchanged). With no recognized
        argument, demangledName is used. Unrecognized arguments are ignored
        here.

        Returns:
            The value returned by the base-class setup() when it is not
            None; otherwise None, after self.query has been set.
        """
        err = super().setup()
        if err is not None:
            return err

        name_col_name = 'demangledName'
        for arg in self.args:
            if arg == self.ARG_BASE:
                name_col_name = 'shortName'
            elif arg == self.ARG_MANG:
                # Older exports may lack the mangledName column; only switch
                # to it when it is actually present.
                if self.table_col_exists('CUPTI_ACTIVITY_KIND_KERNEL', 'mangledName'):
                    name_col_name = 'mangledName'

        self.query = self.query_stub.format(NAME_COL_NAME=name_col_name)
        return None

# Script entry point: only runs when this file is executed directly, not when
# it is imported. Main() is not defined in this file; presumably it is
# provided by nsysstats.Report -- confirm.
if __name__ == "__main__":
    CUDAGPUKernelSummary.Main()
