jobinfo

#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
#
# jobinfo - collect job information from slurm in nicely readable format
#
# Copyright 2015 Anders Halager <aeh@birc.au.dk>
# More functionality added by:
# Bob Dröge <b.e.droge@rug.nl>
# Egon Rijpkema <e.m.a.rijpkema@rug.nl>
# Fokke Dijkstra <f.dijkstra@rug.nl>
#
# LICENSE: MIT

from collections import namedtuple as NT
import datetime
import json
import math
import os
import pwd
import re
import requests
import subprocess
import sys
import time


# Parameters affecting when hints about job performance will be given.
# They are read by get_hints(): MIN_WALLTIME is compared against the elapsed
# walltime in seconds, MIN_MEMORY (multiplied by the core count) against the
# unused part of the requested memory in bytes.
MIN_WALLTIME = 180          # Minimum walltime (seconds) needed before printing job hints
MIN_MEMORY = 1.5*1024**3      # Minimum amount of memory (bytes) that may be left unused per core
IGNORE_PARTITIONS = [       # Partitions for which to ignore memory and cpu usage,
    'gpu',                  # e.g. GPU partitions where GPU usage is more important,
    'gpushort',             # or where full or partial nodes are allocated anyway.
    'lab'
]

# pynumparser should have been installed to /usr/lib.
# However, if another Python version is loaded as module,
# Python may fail to find pynumparser. In this case we add it
# to the search path manually and try again.
try:
    import pynumparser
except ImportError:
    sys.path.append(f'/usr/lib/python3.{sys.version_info.minor}/site-packages/')
    import pynumparser


def append(l, x):
    """Merge x into the comma-separated string l.

    An empty l yields x unchanged. Otherwise the distinct entries of l
    together with x are returned, sorted and joined with commas.
    """
    if l != '':
        entries = set(l.split(','))
        entries.add(x)
        return ','.join(sorted(entries))
    return x


def keep_first(a, b):
    """Return a, unless a is the empty string and b is truthy, then return b."""
    if a == '' and b:
        return b
    return a


def time_max(a, b):
    """Return the larger of two time strings.

    'UNLIMITED' wins over everything; an empty or 'INVALID' value loses to
    the other argument. Remaining values are compared with max().
    """
    if a == 'UNLIMITED' or b == 'UNLIMITED':
        return 'UNLIMITED'
    unusable = ('', 'INVALID')
    if a in unusable:
        return b
    if b in unusable:
        return a
    return max(a, b)


def time_min(a, b):
    """Return the smaller of two time strings.

    An empty, 'INVALID' or 'UNLIMITED' value is treated as "no usable
    value" and the other argument is returned instead. Two usable values
    are compared with the built-in min().
    """
    unusable = ('', 'INVALID', 'UNLIMITED')
    if a in unusable:
        return b
    if b in unusable:
        return a
    # The original returned max() here, which made this identical to a maximum.
    return min(a, b)


def maxtot(a, b):
    """Combine two pairs: maximum of the first items, sum of the second items."""
    return (max(a[0], b[0]), a[1] + b[1])


def add(a, b):
    """Return the sum of a and b (used as a field combinator)."""
    total = a + b
    return total


def tres_mem(s=None):
    """Extract the ``mem=`` entry from a comma-separated TRES string, in bytes.

    Args:
        s: TRES string such as ``"cpu=...,mem=12K,pages=0"``, or None.

    Returns:
        The memory value converted with byte_size(), or 0.0 when s is None
        or contains no ``mem=`` entry.
    """
    if s is None:
        return 0.0
    # Anchor on a comma or a string boundary on both sides so the entry is
    # found regardless of its position, while "vmem=" is still not matched.
    found = re.search(r'(?:^|,)mem=([0-9.]+[KMGTPE]?)(?:,|$)', s)
    if found is None:
        # Previously this raised AttributeError on the missing match.
        return 0.0
    return byte_size(found.group(1))


def tres_disk(s=None):
    """Extract the ``fs/disk=`` entry from a comma-separated TRES string, in bytes.

    Args:
        s: TRES string such as ``"energy=0,fs/disk=2332,mem=0"``, or None.

    Returns:
        The disk value converted with byte_size(), or 0.0 when s is None
        or contains no ``fs/disk=`` entry.
    """
    if s is None:
        return 0.0
    # Accept the entry at the very start of the string as well as after a comma.
    found = re.search(r'(?:^|,)fs/disk=([0-9.]+[KMGTPE]?)', s)
    if found is None:
        # Previously this raised AttributeError on the missing match.
        return 0.0
    return byte_size(found.group(1))


def tres_mem_d(s=None):
    """Extract the ``mem=`` and ``fs/disk=`` entries from a TRES string.

    Args:
        s: Comma-separated TRES string such as
           ``"cpu=...,fs/disk=2332,mem=12K,pages=0"``, or None.

    Returns:
        Tuple ``(memory_bytes, disk_bytes)``, each converted with
        byte_size(). An entry that is absent (or s being None) yields 0.0.
    """
    if s is None:
        return (0.0, 0.0)

    def entry(name):
        # The original pattern ended in "[,$]", a character class that matches
        # a literal "$" rather than end-of-string, so an entry at the end of
        # the list was never found. Use a real alternation instead, and allow
        # the entry to be first in the list too.
        found = re.search(r'(?:^|,)' + re.escape(name) + r'=([0-9.]+[KMGTPE]?)(?:,|$)', s)
        if found is None:
            return 0.0
        return byte_size(found.group(1))

    return (entry('mem'), entry('fs/disk'))


def byte_size(s=None):
    """Convert a size string with an optional binary unit suffix to bytes.

    Args:
        s: Number optionally followed by one of K, M, G, T, P, E
           (powers of 1024), e.g. ``"1.5G"``; or None.

    Returns:
        The size in bytes as a float. None, the empty string and the
        literal ``"16?"`` yield 0.0.

    Raises:
        ValueError: if the numeric part cannot be parsed by float().
    """
    # NOTE(review): "16?" is special-cased as "no value"; where that literal
    # comes from is not visible in this file.
    # The empty string is handled here too: it used to raise IndexError on s[-1].
    if not s or s == "16?":
        return 0.0
    m = {'K': 10, 'M': 20, 'G': 30, 'T': 40, 'P': 50, 'E': 60}
    scale = 2**m.get(s[-1], 0)
    if scale != 1:
        # Drop the unit suffix before parsing the number.
        s = s[:-1]
    return scale * float(s)


def date_str(s=None):
    """Return s, or a far-future timestamp when s is None or blank.

    The placeholder "9999-01-01T00:00:00" sorts after any real timestamp,
    so min() over start times ignores rows without a value.
    """
    if s is not None and s.strip() != "":
        return s
    return "9999-01-01T00:00:00"


def format_bs(x):
    """Format a byte count with two decimals and a binary unit suffix.

    The unit is one of ' ', K, M, G, T, P, E (a space for plain bytes),
    chosen from the base-2 logarithm of x + 1.
    """
    units = ' KMGTPE'
    exponent = int(math.log(x + 1, 2) / 10)
    scaled = x / 2**(10 * exponent)
    return "%.2f%s" % (scaled, units[exponent])


def parse_time(t):
    """Split a time string of the form [DD-[hh:]]mm:ss into its parts.

    Returns:
        Tuple ``(seconds, minutes, hours, days)`` with seconds as float and
        the rest as int. Missing hours/days are 0. A string that does not
        match at its start yields ``(0.0, 0, 0, 0)``.
    """
    pattern = (r'(((?P<days>\d+)-)?(?P<hours>\d\d):)?'
               r'(?P<minutes>\d\d):(?P<seconds>\d\d(\.\d+)?)')
    found = re.match(pattern, t)
    if found is None:
        return 0.0, 0, 0, 0
    parts = found.groupdict()
    return (
        float(parts['seconds']),
        int(parts['minutes']),
        int(parts['hours'] or '0'),
        int(parts['days'] or '0'),
    )


def elapsed_to_seconds(elapsed):
    """Convert a [DD-[hh:]]mm:ss time string to a total number of seconds."""
    seconds, minutes, hours, days = parse_time(elapsed)
    whole_seconds = ((days * 24 + hours) * 60 + minutes) * 60
    return whole_seconds + seconds


def f_rss(x, meta):
    """Format the maximum RSS together with its node and per-task/per-node label."""
    size_text = format_bs(x)
    return "%s (%s, %s)" % (size_text, meta.MaxRSSNode, meta.MaxMemPerTask)


def f_tres_i(x, meta):
    """Format the (memory, disk read) pair as two report lines.

    The first line continues the caller's label; the second carries its own
    "Total Disk Read" label. For jobs whose state contains "RUNNING" both
    values are marked as covering only completed steps.
    """
    if "RUNNING" not in meta.State:
        template = "%s\n" \
                   "Total Disk Read     : %s"
    else:
        template = "%s (Until last completed step)\n" \
                   "Total Disk Read     : %s (Until last completed step)"
    mem_text = format_bs(x[0])
    disk_text = format_bs(x[1])
    return template % (mem_text, disk_text)


def f_tres_o(x, meta):
    """Format the total disk write size.

    For jobs whose state contains "RUNNING" a note and a trailing newline
    are appended to the value.
    """
    if "RUNNING" not in meta.State:
        return "%s" % format_bs(x)
    return "%s (Until last completed step)\n" % format_bs(x)


def f_dw(x, meta):
    """Format the maximum disk write together with its node and per-task label."""
    size_text = format_bs(x)
    return "%s (%s, %s)" % (size_text, meta.MaxDiskWriteNode, meta.MaxDiskWritePerTask)


def f_dr(x, meta):
    """Format the maximum disk read together with its node and per-task label."""
    size_text = format_bs(x)
    return "%s (%s, %s)" % (size_text, meta.MaxDiskReadNode, meta.MaxDiskReadPerTask)


def f_cpu(x, meta):
    """Format CPU time x as a percentage of the job's total CPU time.

    Returns "--" when the total CPU time is zero.
    """
    total_cpu_secs = elapsed_to_seconds(meta.TotalCPU)
    if total_cpu_secs != 0:
        part_secs = elapsed_to_seconds(x)
        return "%5.2f%%" % (part_secs / total_cpu_secs * 100)
    return "--"


def f_mem(x, meta):
    """Format a memory request, spelling out a trailing c/n as /core or /node.

    A value without either suffix is returned unchanged.
    """
    amount, suffix = x[:-1], x[-1:]
    if suffix == 'c':
        return "%s/core" % amount
    if suffix == 'n':
        return "%s/node" % amount
    return x


def f_time(x, meta):
    """Format a time string as [DD-]hh:mm:ss, aligned with the job's other times.

    The width of the day prefix is taken from the longest day count among
    meta.timelimit, meta.elapsed and meta.TotalCPU, so that the hh:mm:ss
    parts of those three report lines line up.

    Returns:
        The formatted time, or "--" when it amounts to 00:00:00 (which is
        also what an unparsable value such as "UNLIMITED" produces).
    """
    # The trailing '-' guarantees max() always has at least one candidate
    # (contributing a width of 0) when none of the times has a day part.
    all_times = [meta.timelimit, meta.elapsed, meta.TotalCPU, '-']
    days_len = max(len(y.split('-')[0]) for y in all_times if '-' in y)
    ss, mm, hh, dd = parse_time(x)
    if days_len == 0:
        day_prefix = ""
    elif dd > 0:
        # Width is the digits plus the '-' separator. The original padded to
        # days_len only, one column short of the blank prefix below, which
        # misaligned day counts with fewer digits than the widest one.
        day_prefix = ("%i-" % dd).rjust(days_len + 1)
    else:
        day_prefix = " " * (days_len + 1)
    res = "%s%02i:%02i:%02i" % (day_prefix, hh, mm, ss)
    if res.strip() == "00:00:00":
        return "--"
    return res


def f_cputime(x, meta):
    """Format the used CPU time and, when it is non-zero, append the efficiency."""
    formatted = f_time(x, meta)
    if formatted == "--":
        return formatted
    return formatted + " (efficiency: %5.2f%%)" % efficiency(meta)


def f_str(x, meta):
    """Format a value by plain string conversion; meta is not used."""
    text = str(x)
    return text


def f_date(x, meta):
    """Format a timestamp, showing "--" for the value "unknown" (any case)."""
    text = str(x)
    return "--" if text.lower() == "unknown" else text


def f_state(x, meta):
    """Format the job state, followed by the pending reason and dependencies.

    Args:
        x: Comma-separated list of states.
        meta: Record providing the ``reason`` and ``dependencies`` strings.

    Returns:
        The distinct states joined by commas in sorted order, then
        " <reason>" and " (<dependencies>)" when those are non-empty. When
        there is more than one state, "COMPLETED" and empty entries are
        dropped so that only the noteworthy states remain.
    """
    states = set(x.split(","))
    if len(states) > 1:
        states -= {"COMPLETED", ""}
    reason = meta.reason
    if reason != '':
        reason = ' ' + reason
    deps = meta.dependencies
    if deps != '':
        deps = " (%s)" % deps
    # Sort for a stable result: joining a set directly printed the states in
    # an order that could differ from run to run.
    return ','.join(sorted(states)) + reason + deps


def get_total_memory_request(memory_spec, ncpus, nnodes):
    """Return the total memory requested by a job, in bytes.

    Args:
        memory_spec: Requested memory such as "4Gc" (per core) or "4Gn"
            (per node).
        ncpus: Number of cores of the job.
        nnodes: Number of nodes of the job.

    Returns:
        The per-core request multiplied by ncpus for a "c" suffix, otherwise
        the request multiplied by nnodes.
    """
    suffix = memory_spec[-1:]
    if suffix in ('c', 'n'):
        amount = memory_spec[:-1]
    else:
        # No c/n marker: keep the whole string. The original always dropped
        # the last character, which turned e.g. "4G" into 4 bytes.
        # NOTE(review): such a value is scaled per node, as the original
        # fallback did — confirm what an unsuffixed ReqMem means on your
        # Slurm version.
        amount = memory_spec
    memory_req = byte_size(amount)

    if suffix == 'c':
        return memory_req * ncpus
    return memory_req * nnodes


def get_cpus_node(nodenames):
    """Return the number of CPUs of a node, as reported by scontrol.

    Args:
        nodenames: Comma-separated node names; only the last one is queried.

    Returns:
        The first ``cpu=<n>`` value found in the output of
        ``scontrol show -o node <name>``, or 1 when there is none.
    """
    nodename = nodenames.split(',')[-1]
    # The context manager closes the pipe and waits for scontrol to exit;
    # the original left the child process unreaped.
    with subprocess.Popen(['scontrol', 'show', '-o', 'node', nodename],
                          stdout=subprocess.PIPE) as info:
        output = info.stdout.read().decode('UTF-8')
    num_cpus_match = re.search(r'cpu=(\d+)', output)
    if num_cpus_match:
        return int(num_cpus_match.group(1))
    return 1


def get_hints(meta):
    """Print hints about poor CPU or memory usage of a finished job.

    Nothing is printed when the job has no end time yet, used no CPU time,
    ran for less than MIN_WALLTIME seconds, or ran in one of the
    IGNORE_PARTITIONS. Otherwise the CPU efficiency and the fraction of the
    requested memory that was used are checked and any resulting hints are
    printed as a numbered list.
    """
    if meta.end.lower() == 'unknown':
        return

    cputime_secs = elapsed_to_seconds(meta.TotalCPU)
    walltime_secs = elapsed_to_seconds(meta.elapsed)
    # Skip jobs without CPU time, with too little walltime, or in partitions
    # for which CPU and memory usage are not meaningful.
    if (cputime_secs == 0 or walltime_secs < MIN_WALLTIME
            or meta.Partition in IGNORE_PARTITIONS):
        return

    hints = []

    # CPU efficiency: used CPU time as a percentage of cores * walltime.
    ncpus = meta.ncpus
    eff = 100 * cputime_secs / (ncpus * walltime_secs)
    if eff < 75:
        if ncpus == 1:
            cpu_hint = [
                "The program efficiency is low.",
                "Check the file in- and output pattern of your application.",
            ]
        elif eff <= (100.0 / ncpus):
            # At most one core's worth of work was done on several cores.
            cpu_hint = [
                "The program efficiency is very low. Your program does not seem to run in",
                "parallel. Please check the program documentation to see how to make the",
                "program run in parallel.",
                "If you can't find information about this, the program will not run in",
                "parallel! Stop requesting multiple CPU cores if that is the case.",
            ]
        else:
            cpu_hint = [
                "The program efficiency is low. Your program is not using the assigned cores",
                "effectively. Please check if you are using all the cores you requested.",
                "You may also need to check the file in- and output pattern of your program.",
            ]
        hints.append(cpu_hint)

    # Memory efficiency.
    # Average number of cores used per node.
    cores_per_node = ncpus / meta.NNodes
    # Total number of cores of the node that reported the maximum RSS.
    cores_max_node = get_cpus_node(meta.MaxRSSNode)
    # Total amount of memory requested by the job.
    req_memory = get_total_memory_request(meta.ReqMem, ncpus, meta.NNodes)
    # A full-node request is not checked, because all memory is available anyway.
    if cores_per_node < cores_max_node:
        used_memory = meta.TRESUsageInTot[0]
        # Hint only when less than 75% of the request was used and more than
        # MIN_MEMORY per core was left unused.
        if (used_memory / req_memory < 0.75
                and req_memory - used_memory > MIN_MEMORY * ncpus):
            hints.append([
                "You requested much more memory than your program used.",
                "Please reduce the requested amount of memory.",
            ])

    if not hints:
        return

    print("Hints and tips      :")
    for hint_number, hint in enumerate(hints, start=1):
        # The first line carries the number, continuation lines are indented.
        print(" %i) %s" % (hint_number, hint[0]))
        for line in hint[1:]:
            print("    %s" % line)
    print(" *) For more information on these issues see:")
    print("    https://wiki.hpc.rug.nl/peregrine/additional_information/job_hints")


def efficiency(meta):
    """Return the CPU efficiency of a job as a percentage.

    This is the used CPU time divided by (cores * walltime), times 100.
    When the CPU time, the walltime or the core count is zero, 0 is returned.
    """
    cputime_secs = elapsed_to_seconds(meta.TotalCPU)
    walltime_secs = elapsed_to_seconds(meta.elapsed)
    ncpus = meta.ncpus
    if 0 in (cputime_secs, walltime_secs, ncpus):
        return 0
    return 100 * cputime_secs / (ncpus * walltime_secs)


def whoami():
    """Return the login name belonging to the current process's user id."""
    passwd_entry = pwd.getpwuid(os.getuid())
    return passwd_entry.pw_name


# Description of one column requested from Slurm:
#   name        - field name as passed in --format to sacct (and to sstat
#                 when prefer_live is set)
#   ctor        - converts the raw text to a value; called without
#                 arguments to obtain the default for an empty value
#   combinator  - merges the values of two rows (see combine())
#   shown       - whether main() prints the field
#   prefer_live - whether the field is also requested from sstat
#   formatter   - called as formatter(value, meta) to render the value
#   desc        - label printed in front of the value
Field = NT('Field', 'name ctor combinator shown prefer_live formatter desc')
FIELDS = [
        Field("JobID",               str,        keep_first,   True,  True,  f_str,     "Job ID"),
        Field("JobName",             str,        keep_first,   True,  False, f_str,     "Name"),
        Field("User",                str,        keep_first,   True,  False, f_str,     "User"),
        Field("Partition",           str,        keep_first,   True,  False, f_str,     "Partition"),
        Field("NodeList",            str,        keep_first,   True,  False, f_str,     "Nodes"),
        Field("NNodes",              int,        max,          True,  False, f_str,     "Number of Nodes"),
        Field("ncpus",               int,        max,          True,  False, f_str,     "Cores"),
        Field("NTasks",              int,        max,          True,  True,  f_str,     "Number of Tasks"),
        Field("State",               str,        append,       True,  False, f_state,   "State"),
        Field("Submit",              str,        keep_first,   True,  False, f_str,     "Submit"),
        Field("start",               date_str,   min,          True,  False, f_date,    "Start"),
        Field("end",                 str,        time_max,     True,  False, f_date,    "End"),
        Field("timelimit",           str,        time_max,     True,  False, f_time,    "Reserved walltime"),
        Field("elapsed",             str,        time_max,     True,  False, f_time,    "Used walltime"),
        Field("TotalCPU",            str,        max,          True,  False, f_cputime, "Used CPU time"),
        Field("UserCPU",             str,        max,          True,  False, f_cpu,     "% User (Computation)"),
        Field("SystemCPU",           str,        max,          True,  False, f_cpu,     "% System (I/O)"),
        Field("ReqMem",              str,        keep_first,   True,  False, f_mem,     "Mem reserved"),
        Field("MaxRSS",              byte_size,  max,          True,  True,  f_rss,     "Max Mem (Node/step)"),
        Field("TRESUsageInTot",      tres_mem_d, maxtot,       True,  False, f_tres_i,  "Full Max Mem usage"),
        Field("TRESUsageOutTot",     tres_disk,  add,          True,  False, f_tres_o,  "Total Disk Write"),
        Field("MaxDiskWrite",        byte_size,  max,          False, True,  f_dw,      "Max Disk Write"),
        Field("MaxDiskRead",         byte_size,  max,          False, True,  f_dr,      "Max Disk Read"),

        # Helper fields: never printed themselves, only read by other code
        # (e.g. the *Node values are used by get_max_node()).
        Field("MaxRSSNode",          str,        keep_first,   False, True,  None,      ""),
        Field("MaxDiskWriteNode",    str,        keep_first,   False, True,  None,      ""),
        Field("MaxDiskReadNode",     str,        keep_first,   False, True,  None,      ""),
        Field("Comment",             str,        keep_first,   False, False, None,      ""),
        ]
# Per-column lists derived from FIELDS; all share the order of FIELDS, so an
# index into one of them identifies the same field in a parsed row.
FIELD_NAMES = [f.name for f in FIELDS]
FIELD_NAMES_LIVE = [f.name for f in FIELDS if f.prefer_live]
FIELD_CTORS = [f.ctor for f in FIELDS]
FIELD_COMB = [f.combinator for f in FIELDS]
# Ready-made --format arguments for sacct and sstat respectively.
FORMAT_STR = "--format=%s" % (",".join(FIELD_NAMES))
FORMAT_LIVE_STR = "--format=%s" % (",".join(FIELD_NAMES_LIVE))
# Record passed to the formatters: one attribute per field, plus values that
# main() computes itself.
Meta = NT(
    'Meta',
    FIELD_NAMES
    + ['MaxMemPerTask', 'MaxDiskWritePerTask', 'MaxDiskReadPerTask', 'dependencies', 'reason']
)


def combine(xs):
    """Merge a list of parsed rows into a single row.

    Each column is folded with its combinator from FIELD_COMB. The first
    row of xs is updated in place and returned.
    """
    merged = xs[0]
    remaining_rows = xs[1:]
    for row in remaining_rows:
        for column, merge in enumerate(FIELD_COMB):
            merged[column] = merge(merged[column], row[column])
    return merged


def get_max_node(xs, parameter):
    """Find the node that reported the largest value of a field.

    Args:
        xs: Parsed rows, laid out in FIELD_NAMES order.
        parameter: Field name, e.g. 'MaxRSS'; the node is read from the
            field named parameter + 'Node'.

    Returns:
        Tuple ``(node, label)`` for the first row holding the largest value,
        where label is "per task" if that row has more than one task and
        "per node" otherwise. ``("Node unknown", "N/A")`` when no row has a
        value above zero.
    """
    best_value = 0
    best_label = "per node"
    best_node = ""
    for row in xs:
        candidate = row[FIELD_NAMES.index(parameter)]
        candidate_node = row[FIELD_NAMES.index(parameter+'Node')]
        label = "per task" if row[FIELD_NAMES.index('NTasks')] > 1 else "per node"
        if candidate > best_value:
            best_value, best_label, best_node = candidate, label, candidate_node
    if best_value == 0:
        return ("Node unknown", "N/A")
    return (best_node, best_label)


def get_values(jobid):
    """Query sacct for a job and return its rows as parsed values.

    Args:
        jobid: Job id passed to ``sacct -j``.

    Returns:
        One list per sacct output line, holding the fields in FIELD_NAMES
        order, each converted with its constructor from FIELD_CTORS. An
        empty field gets the constructor's default (called without
        arguments).

    Exits the program with status 1 and "No such job" on stderr when sacct
    prints no lines.
    """
    # A snowman is used as delimiter because it will not occur in the data.
    delimiter = u'\u2603'
    sacct_cmd = [
        'sacct', FORMAT_STR, '--parsable', '--noheader', '--delimiter=' + delimiter, '-j', jobid
    ]
    xs = []
    # Arguments are encoded explicitly as UTF-8, as in the original. The
    # context manager closes the pipe and waits for sacct, which the
    # original never did.
    with subprocess.Popen([arg.encode('utf-8') for arg in sacct_cmd],
                          stdout=subprocess.PIPE) as info:
        for line in info.stdout:
            raw_fields = line.decode('utf-8').strip().split(delimiter)
            # zip() drops the empty trailing field produced by --parsable.
            xs.append([ctor(s) if s != "" else ctor()
                       for ctor, s in zip(FIELD_CTORS, raw_fields)])
    if not xs:
        print("No such job", file=sys.stderr)
        sys.exit(1)
    return xs


def get_live_values(jobid):
    """Query sstat for a running job and return its rows as parsed values.

    Only the fields marked prefer_live are requested from sstat; all other
    columns are filled with their constructor's default so that each row has
    the same layout as the rows from get_values().

    Args:
        jobid: Job/step selection passed to ``sstat -j``.

    Returns:
        One list per sstat output line, in FIELDS order. Empty when sstat
        prints nothing.
    """
    xs = []
    # The context manager closes the pipe and waits for sstat to exit.
    with subprocess.Popen(
            [
                'sstat', FORMAT_LIVE_STR, '--parsable', '--noheader', '-a', '-j',
                jobid
            ],
            stdout=subprocess.PIPE) as info:
        for line in info.stdout:
            vals = line.decode('utf-8').strip().split('|')
            j = 0  # index of the next live value in this output line
            x = []
            for f in FIELDS:
                if f.prefer_live:
                    raw = vals[j]
                    # Empty values get the default, as get_values() does;
                    # int('') and byte_size('') would otherwise fail.
                    x.append(f.ctor(raw) if raw != "" else f.ctor())
                    j += 1
                else:
                    x.append(f.ctor())
            # Append once per output line. The original did this inside the
            # field loop, adding the same row once for every field.
            xs.append(x)
    return xs


def parse_gpu_string(node_string):
    """
    Expand a node specification into a list of node names.

    A specification with a bracketed number sequence, such as
    pg-gpu[1-3] or pg-gpu[2,4,5], is expanded to one name per number,
    with the number zero-padded to two digits. Any other string is
    returned as a single-element list.
    """
    bracketed = re.search(r'(.+\[)([0-9]|-|,)+?(?=\])', node_string)
    if bracketed is not None:
        base, sequence = bracketed.group().split('[')
        numbers = pynumparser.NumberSequence(int).parse(sequence)
        return ['{}{:02d}'.format(base, number) for number in numbers]
    return [node_string]


def get_gpu_usage(node, start, end):
    """
    Calculate the average GPU usage between begin and end stamps.

    The ``utilization_gpu`` metric of the node is fetched from the
    monitoring server's ``query_range`` API in 60 second steps, and the
    samples of all returned series are averaged together.

    Args:
      node (string): The GPU node.
      start (int): start of measurements timestamp.
      end (int): end of measurements timestamp.
    Returns: The average of all samples, or -1 when there are none.
    """
    payload = {
        'query':
        'utilization_gpu{{env="peregrine",instance="{}:9101",job="gpu"}}'.
        format(node),
        'start': start,
        'end': end,
        'step': '60s',
    }

    data = requests.get(
        'https://knyft.hpc.rug.nl:9091/api/v1/query_range', params=payload)

    # Decode the response once; the original re-parsed the whole body for
    # every series it looped over.
    series_list = json.loads(data.content.decode())['data']['result']
    values = []
    for series in series_list:
        values += series['values']

    if values:
        # Each sample is a sequence whose second item holds the measurement.
        return sum(int(sample[1]) for sample in values) / len(values)
    return -1


def get_gpus_usage(nodes, start, end):
    """
    Calculate the average GPU usage between begin and end stamps
    for each node of a node specification.

    Args:
      nodes (string): The GPU node(s) in slurm format.
      start (int): start of measurements timestamp.
      end (int): end of measurements timestamp.
    Returns: List: A list of tuples [(<hostname>, <percentage>)]
    """
    usage_per_node = []
    for hostname in parse_gpu_string(nodes):
        usage_per_node.append((hostname, get_gpu_usage(hostname, start, end)))
    return usage_per_node


def main(jobid):
    """Collect the information about a job and print the report.

    Accounting data is read with get_values(). For a job whose state is
    exactly "RUNNING", and which belongs to the calling user (or when running
    as root), live data from get_live_values() is merged in. For a job whose
    state is exactly "PENDING", the dependencies and pending reason are read
    from squeue. Every field marked as shown is then printed as
    "<label>: <value>".

    Args:
        jobid: The job id given on the command line.
    """
    values = get_values(jobid)
    # The per-row maxima have to be determined before combine() merges the rows.
    (max_mem_node, MaxMemPerTask) = get_max_node(values, 'MaxRSS')
    (max_dw_node, MaxDiskWritePerTask) = get_max_node(values, 'MaxDiskWrite')
    (max_dr_node, MaxDiskReadPerTask) = get_max_node(values, 'MaxDiskRead')
    y = combine(values)
    # Preliminary record, only used to look at the job's state and owner.
    meta = Meta._make(y + ['', '', '', '', ''])
    ys = [y]
    if meta.State == "RUNNING" and (os.getuid() == 0 or meta.User == whoami()):
        # get more info from sstat
        tmp = get_live_values("%s,%s.batch" % (jobid, jobid))
        if len(tmp) != 0:
            (max_mem_node, MaxMemPerTask) = get_max_node(tmp, 'MaxRSS')
            (max_dw_node, MaxDiskWritePerTask) = get_max_node(tmp, 'MaxDiskWrite')
            (max_dr_node, MaxDiskReadPerTask) = get_max_node(tmp, 'MaxDiskRead')
            ys.append(combine(tmp))
    if meta.State == "PENDING":
        # The context manager closes the pipe and waits for squeue to exit.
        with subprocess.Popen(
                ['squeue', '--format=%E;%R', '--noheader', '-a', '-j', jobid],
                stdout=subprocess.PIPE) as info:
            first_line = info.stdout.readline().decode('utf-8').strip()
        # partition() copes with empty output and with a ';' inside the
        # reason, both of which made the original two-way unpacking fail.
        dependencies, _, reason = first_line.partition(";")
    else:
        dependencies = ""
        reason = ""
    y = combine(ys)
    meta = Meta._make(
        y + [MaxMemPerTask, MaxDiskWritePerTask, MaxDiskReadPerTask, dependencies, reason]
    )
    meta = meta._replace(
        MaxRSSNode=max_mem_node, MaxDiskWriteNode=max_dw_node, MaxDiskReadNode=max_dr_node
    )

    for field, val in zip(FIELDS, y):
        if field.shown:
            print("%-20s: %s" % (field.desc, field.formatter(val, meta)))

def usage(pipe):
    """Print the command-line help text to the given stream."""
    help_text = (
        "jobinfo - collates job information from the 'sstat', 'sacct' and\n"
        "'squeue' SLURM commands to give a uniform interface for both current\n"
        "and historical jobs.\n"
        "\n"
        "Usage:\n"
        "    jobinfo <job id>\n"
        "\n"
        "Report problems to hpc@rug.nl"
    )
    print(help_text, file=pipe)


if __name__ == "__main__":
    cli_args = sys.argv
    # A help flag anywhere on the command line takes precedence.
    if "-h" in cli_args or "--help" in cli_args:
        usage(sys.stdout)
        sys.exit(0)
    # Exactly one argument, the job id, is expected.
    if len(cli_args) != 2:
        usage(sys.stderr)
        sys.exit(1)
    jobid = cli_args[1]
    # A job id may only consist of digits, underscores and dots.
    unexpected_chars = set(jobid) - set("0123456789_.")
    if unexpected_chars:
        print(
            "The argument does not look like a valid job id", file=sys.stderr)
        usage(sys.stderr)
        sys.exit(1)
    main(jobid)