# Copyright (C) 2024, UChicago Argonne, LLC
# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file
# in the top-level directory.

import dataclasses
import datetime
import re
import time
from dataclasses import dataclass
from datetime import timezone
from pprint import pformat
from typing import List, Union, Optional

# Sentinel for a key that is absent from a record vs. present-but-empty.
FIELD_NOT_PROVIDED = None
VALUE_NOT_PROVIDED = ""
DATETIME_EPOCH_UTC = datetime.datetime(1970, 1, 1).replace(tzinfo=timezone.utc)


class EventType(object):
    """Names for the broad categories an accounting record can belong to."""

    JOB = "JOB"
    TASK = "TASK"
    BOOT = "BOOT"
    BLOCK = "BLOCK"
    VNODE = "VNODE"
    RESERVATION = "RESERVATION"
    NODE_STATE = "NODE_STATE"
    LICENSE = "LICENSE"
    STATE = "STATE"


class RecType(object):
    """PBS accounting record-type codes.

    See acct.h in OpenPBS.  ``dct`` maps every single-character record code
    to a small descriptor: the human-readable action, the code itself, and
    the EventType category it falls under.
    """

    X = "X"
    S = "S"  # PBS Job Start
    E = "E"  # PBS Job End
    D = "D"  # PBS Job Delete
    Q = "Q"  # PBS Job Queue
    A = "A"  # PBS Job Abort
    R = "R"  # PBS Job rerun
    C = "C"  # PBS Job checkpoint
    T = "T"  # PBS Job checkpoint_restart
    U = "U"  # PBS Reservation requested by user (awaiting confirmation)
    Y = "Y"  # PBS Reservation alteration confirmed by system
    B = "B"  # PBS Reservation begin
    F = "F"  # PBS Reservation finish
    K = "K"  # PBS Reservation removed by system
    k = "k"  # PBS Reservation removed by user
    L = "L"  # License
    a = "a"  # PBS Job alter
    r = "r"  # PBS Job resume
    z = "z"  # PBS Job suspend
    e = "e"  # PBS vnode released, shows job info, final phase
    c = "c"  # PBS vnode upcoming phase
    s = "s"  # PBS vnode request was trimmed
    u = "u"  # PBS vnode release, shows job info
    P = "P"  # PBS vnode provision start for job or reservation
    p = "p"  # PBS vnode provision end for job or reservation

    dct = {
        S: {"action": "start", "record": S, "event_type": EventType.JOB},
        E: {"action": "end", "record": E, "event_type": EventType.JOB},
        D: {"action": "delete", "record": D, "event_type": EventType.JOB},
        Q: {"action": "queue", "record": Q, "event_type": EventType.JOB},
        A: {"action": "abort", "record": A, "event_type": EventType.JOB},
        R: {"action": "rerun", "record": R, "event_type": EventType.JOB},
        C: {"action": "checkpoint", "record": C, "event_type": EventType.JOB},
        T: {"action": "checkpoint_restart", "record": T, "event_type": EventType.JOB},
        U: {"action": "unconfirmed", "record": U, "event_type": EventType.RESERVATION},
        Y: {"action": "confirmed", "record": Y, "event_type": EventType.RESERVATION},
        B: {"action": "begin", "record": B, "event_type": EventType.RESERVATION},
        F: {"action": "finish", "record": F, "event_type": EventType.RESERVATION},
        K: {"action": "system_remove", "record": K, "event_type": EventType.RESERVATION},
        k: {"action": "remove", "record": k, "event_type": EventType.RESERVATION},
        L: {"action": "license", "record": L, "event_type": EventType.LICENSE},
        a: {"action": "alter", "record": a, "event_type": EventType.JOB},
        r: {"action": "resume", "record": r, "event_type": EventType.JOB},
        z: {"action": "suspend", "record": z, "event_type": EventType.JOB},
        e: {"action": "released", "record": e, "event_type": EventType.VNODE},
        c: {"action": "upcoming", "record": c, "event_type": EventType.VNODE},
        s: {"action": "trip", "record": s, "event_type": EventType.VNODE},
        u: {"action": "release", "record": u, "event_type": EventType.VNODE},
        P: {"action": "provision_start", "record": P, "event_type": EventType.VNODE},
        p: {"action": "provision_end", "record": p, "event_type": EventType.VNODE},
        X: {"action": "none", "record": X, "event_type": EventType.STATE},
        None: {"action": None, "record": None, "event_type": None},
    }
class LOG_REGEX(object):
    """This class contains certain useful attributes for working with PBS accounting logs"""

    LOG_FORMAT_PBS = "%m/%d/%Y %H:%M:%S"
    LOG_FORMAT_PBS_US = "%m/%d/%Y %H:%M:%S.%f"
    LOG_FORMAT_PBS_US_MSB_NO_SPACE = "%Y-%m-%dT%H:%M:%S.%f"
    LOG_FORMAT_ISO8601 = "%Y-%m-%dT%H:%M:%S.%f"
    LOG_FORMAT_PBS_GROUPED = re.compile(
        r"^(?P<MONTH>[0-9]{2})/(?P<DAY>[0-9]{2})/(?P<YEAR>[0-9]{4}) "
        + r"(?P<HOUR>[0-9]{2}):(?P<MIN>[0-9]{2}):(?P<SEC>[0-9]{2})(?P<DATA>.*)$"
    )
    LOG_FORMAT_DATA_SEPERATOR_PBS = ";"
    # captures the 1-3 letter record code sitting between the timestamp and payload
    LOG_FORMAT_DATA_SEPERATOR_PBS_RE = re.compile(r";([a-zA-Z]{1,3});")
    LOG_FORMAT_KV_SEPERATOR_PBS = " "
    # key=value tokens; group 3 tracks an opening double quote so embedded
    # (escaped) quotes inside a quoted value are consumed correctly.
    PATTERN_KEY_VALUE_PAIRS = re.compile(r"^(?:\s*([\w.\-]+)=((?:([\"])(?:\\\3|(?!\3).)*\3|[^\"]*?)+))(\s*)(?=\s+[\w.\-]+=|$)")
    RECTYPE = re.compile(r"^([A-Za-z]+);.*")


def join_record(record_value, record_id, record_keyvals):
    """Inverse of split_record: reassemble "REC;identifier;key=value ..."."""
    return ";".join((record_value, record_id, record_keyvals))


def split_record(data):
    """Split "REC;identifier;key=value ..." into its three components."""
    record_value, identifier, record_keyvals = data.split(";", 2)
    return record_value.strip(), identifier.strip(), record_keyvals


def format_key_value_pairs(kv_pairs: dict) -> str:
    """
    Given a dictionary, flatten it into a key value string used by PBS
    """
    return LOG_REGEX.LOG_FORMAT_KV_SEPERATOR_PBS.join(f"{key}={value}" for key, value in kv_pairs.items())


def func_dquote(s):
    """Strip surrounding double quotes and unescape embedded double quotes."""
    return s.strip('"').replace('\\"', '"') if (s.startswith('"') and s.endswith('"')) else s


def func_squote(s):
    """Strip surrounding single quotes and unescape embedded single quotes."""
    return s.strip("'").replace("\\'", "'") if (s.startswith("'") and s.endswith("'")) else s


def parse_key_value_pairs(row: str, record_value=None):
    """This parser will pivot around the = sign.

    Special cases:
      * multiple ``Exit_status`` keys (seen on error records) are collected,
        reduced to a single representative value, and preserved in ``Exit_list``;
      * ``Variable_List`` may contain bash functions and is kept verbatim;
      * a payload containing no ``=`` at all becomes a ``message`` entry;
      * for abort (``A``) records an unparseable tail is accepted as ``message``.

    Raises BadLineError on conflicting duplicate keys or an unparseable payload.
    """
    row = row.strip()
    dct = {}
    pos = 0
    # parse row and create dict, making sure there are no duplicate keys
    exit_status_lst = []
    while pos < len(row):
        tail = row[pos:]
        match = LOG_REGEX.PATTERN_KEY_VALUE_PAIRS.match(tail)
        if tail.find("=") != -1 and match:
            pos += match.end(4) + 1
            key, value, _, _ = match.groups()
            if key == "Exit_status":
                # there can be more than one Exit_status which can related to errors, gather them up.
                exit_status_lst.append(value)
                dct[key] = value
                continue
            elif key == "Variable_List":  # this can contain bash functions.
                # include everything because parsing this would take much more code.
                dct[key] = tail
                break
            if key in dct:
                # make sure the dups have the same value, if so then ok
                if dct[key] != value:
                    raise BadLineError(f'PBS KEY({key}) ERROR: {dct[key]=} != {value=}\n"{row}"')
                else:
                    continue
            dct[key] = value
        elif tail.find("=") == -1:
            # this fixes a message only line:
            # 10/26/2021 16:24:55;A;53.pdw-s1;Job deleted as result of dependency on job 52.pdw-s1
            dct["message"] = tail
            pos = len(row)
        else:
            # if this is an abort record then just accept this and move on.
            # This is to handles lines like this:
            # PBS KEY ERROR: parse error at char 1 in "Reject reply code=15001, aux=0, type=52, ..."
            if record_value == RecType.A:
                dct["message"] = tail
                pos = len(row)
            else:
                raise BadLineError(f'PBS KEY ERROR: parse error at char {pos + 1} in "{row}"')
    if "Exit_status" in dct:
        # we need to now correct the Exit_status
        if len(exit_status_lst) > 1:
            int_exit_status_lst = []
            oth_exit_status_lst = []
            for es in exit_status_lst:
                try:
                    int_exit_status_lst.append(int(es))
                except ValueError:
                    oth_exit_status_lst.append(es)
            # BUGFIX: max() on an empty list raised ValueError when every
            # Exit_status was non-integer; guard on the list instead (the
            # old "is not None" test was dead code — max never returns None).
            if int_exit_status_lst:
                dct["Exit_status"] = str(max(int_exit_status_lst))
            else:
                dct["Exit_status"] = sorted(oth_exit_status_lst).pop() if oth_exit_status_lst else "None"
            dct["Exit_list"] = ",".join([str(_) for _ in exit_status_lst])

    # fix the quotes on the values. Note, this loses and adds data.
    dct = dict(map(lambda e: (e[0], func_squote(func_dquote(e[1].strip()))), dct.items()))
    return dct
class BadLineError(Exception):
    """Raised when an accounting line cannot be split or parsed."""

    pass


class BadTimestampError(Exception):
    """Raised when an accounting line's timestamp cannot be converted."""

    pass


def fix_newlines_extract_time(lines, include_lineno=False, version=1, iso8601=False):
    """Repair records split by stray newlines and extract their timestamps.

    Returns a list of (timestamp, data) tuples — (timestamp, lineno, data)
    when include_lineno is True.  A line that fails to parse is treated as
    the continuation of the most recent good line: the previous result is
    discarded and the merged line is re-parsed in its place.

    version > 1 switches to 1-based line numbers and attributes a merged
    record to its root line (both change historical output).
    """
    assert type(lines) == list, "type(lines) = %s should be a List[str]" % type(lines)

    def _sanitize(_):
        return _.replace("\n", " ")

    parsed_records = []  # the (timestamp, [lineno,] data) tuples returned
    repaired_lines = []  # (lineno, text) of each good line, for back-merging
    for lineno, raw_line in enumerate(lines):
        if version > 1:
            # 1-based numbering so the first file line is line 1.
            lineno = lineno + 1
        line = _sanitize(raw_line)
        try:
            stamp, data = pbs_line_extract_time(line, lineno=lineno, iso8601=iso8601)
        except BadLineError:
            if not parsed_records:
                # nothing earlier to merge this fragment into; drop it
                continue
            parsed_records.pop()  # it was bad/missing data
            # glue the fragment onto the previous good line and re-parse
            prev_lineno, prev_line = repaired_lines.pop()
            merged = prev_line + line
            stamp, data = pbs_line_extract_time(merged, lineno=lineno, iso8601=iso8601)
            repaired_lines.append((prev_lineno, merged))
            if version > 1:
                # report the merged record under its root line number
                lineno = prev_lineno
        else:
            repaired_lines.append((lineno, line))
        entry = (stamp, lineno, data) if include_lineno is True else (stamp, data)
        parsed_records.append(entry)
    return parsed_records


def pbs_line_extract_time(line, lineno=-1, iso8601=False):
    """Split one accounting line into (datetime, "REC;payload").

    The separator regex captures the record code, which is re-attached to
    the payload (this sidesteps bash functions inside Variable_List).
    Raises BadLineError if the line does not split, BadTimestampError if
    the timestamp does not parse.
    """
    re_separator = LOG_REGEX.LOG_FORMAT_DATA_SEPERATOR_PBS_RE
    parsed = re.split(re_separator, line, 1)
    if len(parsed) != 3:
        raise BadLineError(f"problem splitting line#{lineno} using sep:{str(re_separator)}")
    scheduler_timestamp, record_value, data = parsed
    data = f"{record_value};{data}"
    fmt = LOG_REGEX.LOG_FORMAT_ISO8601 if iso8601 else LOG_REGEX.LOG_FORMAT_PBS
    try:
        scheduler_timestamp = datetime.datetime.strptime(scheduler_timestamp, fmt)
    except ValueError:
        raise BadTimestampError(f"problem converting line#{lineno} ts:{scheduler_timestamp}")
    return scheduler_timestamp, data
def seconds_to_hms(seconds: Union[int, float], use_days=False):
    """Render a duration in seconds as PBS-style [D day(s), ][HH:]MM:SS[.ffffff].

    Leading zero components are omitted (e.g. 59 -> "59", 3661 -> "01:01:01").
    When use_days is True, whole days are split out as "N day(s), HH:MM:SS".
    """
    one_day = 86400
    if use_days:
        days = seconds // one_day
        seconds = seconds % one_day
    else:
        days = 0
    # BUGFIX: the fractional second must be scaled to microseconds before
    # rounding; previously round(frac) collapsed it to 0 or 1 and the result
    # rendered as ".000001" (or dropped the fraction entirely).
    microseconds = round((seconds % 1) * 1_000_000)
    seconds = seconds // 1
    if microseconds == 1_000_000:
        # rounding carried over (e.g. 0.9999996s) — promote to a whole second
        microseconds = 0
        seconds += 1
    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)
    days = int(days)
    hours = int(hours)
    minutes = int(minutes)
    seconds = int(seconds)
    if days:
        day_str = "day" if days in (1, -1) else "days"
        result = f"{days} {day_str}, {hours:0>2}:{minutes:0>2}:{seconds:0>2}"
    elif hours:
        result = f"{hours:0>2}:{minutes:0>2}:{seconds:0>2}"
    elif minutes:
        result = f"{minutes:0>2}:{seconds:0>2}"
    else:
        result = f"{seconds:0>2}"
    if microseconds:
        result = f"{result}.{microseconds:0>6}"
    return result


def hms_to_seconds(hms_str):
    """
    a standard hms_str has a format of hh:mm:ss
    hms_str may have only mm:ss or ss portions

    Returns None for empty/non-string input; negative components yield 0.
    Raises ValueError on non-numeric parts, Exception on too many parts.
    """
    if not hms_str or not isinstance(hms_str, str):
        return None
    hms_ls = hms_str.split(":")
    len_hms_ls = len(hms_ls)
    if len_hms_ls == 3:
        hours, minutes, seconds = hms_ls
    elif len_hms_ls == 2:
        hours = 0
        minutes, seconds = hms_ls
    elif len_hms_ls == 1:
        # seconds only
        hours = 0
        minutes = 0
        (seconds,) = hms_ls
    else:
        raise Exception(f"Invalid hms string {hms_str}")

    try:
        hours = int(hours) * 3600
        minutes = int(minutes) * 60
        seconds = int(seconds)
    except ValueError as err:
        raise ValueError(f"Invalid hms string. Error:{err}")
    if hours < 0 or minutes < 0 or seconds < 0:
        result = 0
    else:
        result = hours + minutes + seconds
    return result


def get_time(scheduler_timestamp, pattern):
    """given a PBS log time, return a time object.

    The timestamp string is interpreted as UTC; returns epoch seconds.
    """
    datetimeobj = datetime.datetime.strptime(scheduler_timestamp, pattern)
    datetimeobj = datetimeobj.replace(tzinfo=timezone.utc)
    return datetimeobj.timestamp()


class Parse_Error_PBS(Exception):
    """Raised for general PBS parsing failures."""

    pass


def cast_to_int(value):
    """int() for truthy input, None otherwise (empty string / None pass through)."""
    return int(value) if value else None


def cast_to_list(value, list_delimiter):
    """Split a delimited string into a list; lists pass through; None stays None."""
    try:

        def cl(d, x):
            return x if type(x) == list else str(x).split(d)

        return None if value is None else cl(list_delimiter, value)
    except AttributeError:
        return None


def cast_to_vnode_list(value):
    """Extract the node names from a PBS exec_host/exec_vnode expression.

    Handles the observed shapes:
      mom2/0*4+mom1/0            (host/index[*ncpus] chunks)
      (mom2:ncpus=2)+(mom3:...)  (parenthesized resource chunks)
      (pdw-c03)                  (bare parenthesized host)
    """
    node_group = re.split(r"[+]", value)
    node_lst = []
    for head in node_group:
        # pull off non-node values
        if "*" in head:
            # e.g. mom2/0*4 -> drop the trailing *ncpus
            head = head.rsplit("*", 1)[0]
        elif ":" in head:
            # e.g. (pdw-c3:ncpus=3:mem=1kb) -> keep only the host part.
            # (equivalent to the earlier repeated-rsplit loop: the first
            # ":"-separated token is the node name)
            head = head.strip("(").strip(")").split(":", 1)[0]
        elif head.count("(") == 1 and head.count(")") == 1:
            # bare parenthesized host, e.g. (pdw-c03)
            head = head.strip("(").strip(")")
        # FIXME: this only works for the * case in head;
        # otherwise it will only split the last head, need more examples!
        # drop the /node_job_index suffix if present
        head = head.rsplit("/", 1)[0]
        node_lst.append(head)
    return node_lst
def cast_to_float(str_val, def_val=None):
    """float() with None passthrough; returns def_val on conversion failure."""
    try:
        return None if str_val is None else float(str_val)
    except ValueError:
        return def_val


def cast_to_list_of_x(str_val, list_delimiter, item_cast_func):
    """Split str_val on list_delimiter and cast every item with item_cast_func."""
    lst = cast_to_list(str_val, list_delimiter)
    if lst is not None:
        lst = list(map(item_cast_func, lst))
    return lst


def cast_to_place_dict(value) -> dict:
    """Parse a Resource_List.place string into {arrangement, sharing, groups}.

    Unknown tokens are silently ignored; falsy input yields {} so callers
    can use a plain truthiness check.
    """
    if value:
        valid_arrangement = ["pack", "scatter", "free", "vscatter"]
        valid_sharing = ["shared", "excl", "exclhost"]
        dct = dict(arrangement=None, sharing=None, groups=[])
        for item in value.split(":"):
            if item in valid_arrangement:
                dct["arrangement"] = item
            elif item in valid_sharing:
                dct["sharing"] = item
            elif item.startswith("group="):
                _, group = item.split("=", 1)
                dct["groups"].append(group)
    else:
        dct = {}  # this allows an if check
    return dct


def join_place_dict(dct):
    """Inverse of cast_to_place_dict: render {arrangement, sharing, groups} as a place string."""
    place = []
    if dct:
        # .get() so a partially-populated dict cannot raise KeyError
        arrangement = dct.get("arrangement")
        sharing = dct.get("sharing")
        groups = dct.get("groups")
        if arrangement:
            place.append(arrangement)
        if sharing:
            place.append(sharing)
        if groups:
            for group in groups:
                place.append(f"group={group}")
    return ":".join(place)


def cast_to_select_list(value) -> List[dict]:
    """Parse a Resource_List.select string into a list of chunk dicts.

    Each "+"-separated chunk becomes a dict; a bare leading integer becomes
    key "N".  Quoted resource values keep their content but lose the quotes.
    Splitting is quote-aware so ":"/"+" inside quoted values are preserved.
    """
    if not value:
        chunks = []
    else:
        chunk_lst = re.split(r'\+(?=(?:[^"]*"[^"]*")*[^"]*$)', value)
        chunks = []
        for chunk in chunk_lst:
            chunk_dct = {}
            sub_chunk_lst = re.split(r':(?=(?:[^"]*"[^"]*")*[^"]*$)', chunk)
            for sub_chunk in sub_chunk_lst:
                if "=" not in sub_chunk:
                    chunk_dct["N"] = int(sub_chunk)
                else:
                    # even if multiple are pushed in via qsub, only the last is
                    # taken, thus overwriting the value is acceptable.
                    resource, sub_value = sub_chunk.split("=", 1)
                    if sub_value.startswith('"') and sub_value.endswith('"'):
                        sub_value = sub_value.strip('"')
                    chunk_dct[resource] = sub_value
            chunks.append(chunk_dct)
    return chunks


def join_select_list(lst):
    """Inverse of cast_to_select_list: render chunk dicts back into a select string."""
    select = []
    for chunk_dct in lst:
        sub_select = []
        if "N" in chunk_dct:
            sub_select.append(str(chunk_dct["N"]))
        for key, value in chunk_dct.items():
            if key != "N":
                # BUGFIX: parenthesize the "or" — previously this parsed as
                # a or (b and c), so an already-quoted value containing ":"
                # was wrapped in a second pair of quotes.
                if (":" in value or "+" in value) and not (value.startswith('"') and value.endswith('"')):
                    sub_select.append(f'{key}="{value}"')
                else:
                    sub_select.append(f"{key}={value}")
        select.append(":".join(sub_select))
    return "+".join(select)


def timedelta_to_microtime(timedeltaobj):
    """Convert a timedelta object to microseconds"""
    return timedeltaobj.microseconds + (timedeltaobj.seconds + timedeltaobj.days * 86400) * 1000000


def epoch_to_datetime(time_val):
    """Convert epoch seconds to a naive UTC datetime.

    Falsy input (including the common 0 "unset" default) yields None.
    NOTE: sub-second precision is dropped.
    """
    return None if not time_val else datetime.datetime(*time.gmtime(time_val)[0:6])
def datetime_to_epoch(datetime_obj: datetime) -> float:
    """Given a datetime object with timezone,
    return the number of seconds in epoch.

    A naive datetime is assumed to already be UTC."""
    if datetime_obj.tzinfo is None:
        datetime_obj = datetime_obj.replace(tzinfo=timezone.utc)
    delta = datetime_obj - DATETIME_EPOCH_UTC
    return timedelta_to_microtime(delta) / 1000000


def convert_none_str_to_empty_list(str_val):
    """Map the literal string "None" to []; anything else passes through."""
    return [] if str_val == "None" else str_val


def _get_requester(key_value_dct):
    """Fetch the requester, accepting the alternate "requestor" spelling."""
    requester = key_value_dct.get("requester", FIELD_NOT_PROVIDED)
    if requester == FIELD_NOT_PROVIDED:
        requester = key_value_dct.get("requestor", FIELD_NOT_PROVIDED)
    return requester


class FieldHandlers:
    """Registry of per-field parse (str -> object) and encode (object -> str)
    handlers for PBS accounting records, plus per-record-type processors."""

    # fields copied through unchanged when encoding a record
    encode_passthrough_lst = [
        "action",
        "record",
        "event_type",
        "identifier",
        "sub_identifier",
        "resource",
        "action",
        "requester",
        "raw_user_name",
        "raw_project_name",
        "raw_account_name",
        "raw_exec_vnode",
    ]

    # used when creating PBS records.
    encode_key_rename = {
        "raw_project_name": "project",
        "raw_account_name": "account",
        "raw_user_name": "user",
        "raw_exec_vnode": "exec_vnode",
    }

    # These are the keys that are set by the parser
    builtin_keys = [
        "action",
        "record",
        "event_type",
        "identifier",
        "sub_identifier",
        "scheduler_timestamp",
    ]

    parsers_from_str_lookup = {}
    encoders_to_str_lookup = {}
    record_handler = {}

    @staticmethod
    def update_parsers(dct):
        """Register/override field -> parser mappings."""
        FieldHandlers.parsers_from_str_lookup.update(dct)

    @staticmethod
    def update_encoders(dct):
        """Register/override field -> encoder mappings."""
        FieldHandlers.encoders_to_str_lookup.update(dct)

    @staticmethod
    def update_record_handler(dct):
        """Register/override record code -> record processor mappings."""
        FieldHandlers.record_handler.update(dct)

    @staticmethod
    def process_resource(field_handler, rec_type, resource, key_value_dct, dct):
        """resource can be sent in to be applied to the record, but it also can be in the record.
        If it's in the record, use the one in the record.
        """
        if "resource" in dct:
            new_resource = field_handler.parse_from_str("resource", key_value_dct)
            if new_resource is not None:
                dct["resource"] = new_resource

    @staticmethod
    def parse_from_str(field: str, key_value_dct: dict, bypass=False):
        """Dispatch to the registered parser for ``field``.

        With bypass=True an unregistered field falls back to passthrough;
        otherwise an unregistered field raises."""
        try:
            parser = FieldHandlers.parsers_from_str_lookup[field]
        except KeyError:
            if not bypass:
                raise Exception(f"Parser for {field=} not found.")
            parser = FieldHandlers.parse_pass_or_to_notprovided
        return parser(field, key_value_dct)

    @staticmethod
    def parse_full_record(field_handler, rec_type, resource, key_value_dct, dct):
        """modifies and updates incoming dct with the parser result.
        must maintain the same signature as the other parsers"""
        for key in key_value_dct:
            dct[key] = field_handler.parse_from_str(key, key_value_dct, bypass=True)
        return dct

    @staticmethod
    def parse_record_no_builtin(field_handler, rec_type, resource, key_value_dct, dct):
        """modifies and updates incoming dct with the parser result.
        must maintain the same signature as the other parsers"""
        for key in field_handler.parsers_from_str_lookup:
            if key not in field_handler.builtin_keys and key in key_value_dct:
                dct[key] = field_handler.parse_from_str(key, key_value_dct)
        return dct

    @staticmethod
    def parse_pass_or_to_notprovided(field: str, key_value_dct: dict) -> object:
        """Return the raw value, or FIELD_NOT_PROVIDED when absent."""
        return key_value_dct.get(field, FIELD_NOT_PROVIDED)

    @staticmethod
    def parse_epoch_str_to_datetime(field: str, key_value_dct: dict) -> Optional[datetime.datetime]:
        """Epoch-seconds string -> naive UTC datetime (None when absent/zero)."""
        return epoch_to_datetime(cast_to_float(key_value_dct.get(field, FIELD_NOT_PROVIDED)))

    @staticmethod
    def parse_str_to_int(field: str, key_value_dct: dict) -> Optional[int]:
        """Numeric string -> int (None when absent/empty)."""
        return cast_to_int(key_value_dct.get(field, FIELD_NOT_PROVIDED))

    @staticmethod
    def parse_hms_to_seconds(field: str, key_value_dct: dict) -> Optional[int]:
        """"hh:mm:ss" style string -> total seconds (None when absent)."""
        return hms_to_seconds(key_value_dct.get(field, FIELD_NOT_PROVIDED))

    @staticmethod
    def parse_vnode_lst(field: str, key_value_dct: dict) -> Optional[list]:
        """exec_host/exec_vnode expression -> list of node names."""
        value = key_value_dct.get(field, FIELD_NOT_PROVIDED)
        return cast_to_vnode_list(value) if value else value

    @staticmethod
    def parse_exit_status(field: str, key_value_dct: dict) -> Optional[list]:
        """Normalize an exit status to the string form of its integer value."""
        try:
            return str(int(key_value_dct.get(field, FIELD_NOT_PROVIDED)))
        except (TypeError, ValueError):
            return FIELD_NOT_PROVIDED

    @staticmethod
    def parse_x_used(field: str, key_value_dct: dict) -> Optional[list]:
        """":"-delimited numeric list -> list of ints."""
        return cast_to_list_of_x(key_value_dct.get(field, FIELD_NOT_PROVIDED), ":", cast_to_int)

    @staticmethod
    def parse_place(field: str, key_value_dct: dict) -> Optional[dict]:
        """Resource_List.place string -> place dict."""
        return cast_to_place_dict(key_value_dct.get(field, FIELD_NOT_PROVIDED))

    @staticmethod
    def parse_select(field: str, key_value_dct: dict) -> Optional[list]:
        """Resource_List.select string -> list of chunk dicts."""
        return cast_to_select_list(key_value_dct.get(field, FIELD_NOT_PROVIDED))

    @staticmethod
    def parse_pbs_ts_to_datetime(field: str, key_value_dct: dict) -> Optional[list]:
        """ISO-8601 timestamp string -> naive datetime (tzinfo stripped)."""
        value = key_value_dct.get(field, FIELD_NOT_PROVIDED)
        if value:
            value = datetime.datetime.strptime(value, LOG_REGEX.LOG_FORMAT_ISO8601)
            if hasattr(value, "tzinfo") and value.tzinfo:
                value = value.replace(tzinfo=None)
        return value

    # ^^^ parsers ^^^
    # vvv encoders vvv

    @staticmethod
    def encode_to_str(field: str, key_value_dct: dict, attempt_str=False):
        """Dispatch to the registered encoder for ``field``.

        With attempt_str=True an unregistered field falls back to str();
        otherwise an unregistered field raises."""
        try:
            encoder = FieldHandlers.encoders_to_str_lookup[field]
        except KeyError:
            if not attempt_str:
                raise Exception(f"Encoder for {field=} not found.")
            encoder = FieldHandlers.encode_any_to_str
        return encoder(field, key_value_dct)

    @staticmethod
    def encode_full_record(field_handler, rec_type, resource, key_value_dct, dct):
        """modifies and updates incoming dct with the parser result.

        NOTE(review): ``result`` is overwritten on each iteration, so only the
        last key's encoding is returned — looks suspicious; confirm intent."""
        result = ""
        for key in dct:
            result = field_handler.encode_to_str(key, key_value_dct, attempt_str=True)
        return result

    @staticmethod
    def encode_any_to_str(field: str, key_value_dct: dict) -> str:
        """str() of the value; VALUE_NOT_PROVIDED when absent/None."""
        value = key_value_dct.get(field, None)
        return str(value) if value is not None else VALUE_NOT_PROVIDED

    @staticmethod
    def encode_datetime_to_epoch_str(field: str, key_value_dct: dict) -> str:
        """datetime -> epoch-seconds string."""
        value = key_value_dct.get(field, None)
        return str(datetime_to_epoch(value))

    @staticmethod
    def encode_int_to_str(field: str, key_value_dct: dict) -> str:
        """int -> str via the generic encoder."""
        return FieldHandlers.encode_any_to_str(field, key_value_dct)

    @staticmethod
    def encode_float_to_str(field: str, key_value_dct: dict) -> str:
        """float -> str; VALUE_NOT_PROVIDED when absent/None."""
        value = key_value_dct.get(field, None)
        return str(value) if value is not None else VALUE_NOT_PROVIDED

    @staticmethod
    def encode_str_to_str(field: str, key_value_dct: dict):
        """Identity encode; VALUE_NOT_PROVIDED when absent/None."""
        value = key_value_dct.get(field, None)
        return value if value is not None else VALUE_NOT_PROVIDED

    @staticmethod
    def encode_datetime_to_str(field: str, key_value_dct: dict) -> str:
        """datetime -> PBS "%m/%d/%Y %H:%M:%S" string (no sub-seconds)."""
        value = key_value_dct.get(field, None)
        return value.strftime(LOG_REGEX.LOG_FORMAT_PBS)

    @staticmethod
    def encode_datetime_to_iso8601str(field: str, key_value_dct: dict) -> str:
        """datetime (or pre-formatted str) -> ISO-8601 string, "+00:00" stripped."""
        value = key_value_dct.get(field, None)
        if type(value) == str:
            return value.replace("+00:00", "") if value.endswith("+00:00") else value
        return value.strftime(LOG_REGEX.LOG_FORMAT_ISO8601)

    @staticmethod
    def encode_seconds_to_hms(field: str, key_value_dct: dict) -> str:
        """seconds -> "hh:mm:ss"-style string; VALUE_NOT_PROVIDED when absent."""
        value = key_value_dct.get(field, None)
        return seconds_to_hms(value) if value is not None else VALUE_NOT_PROVIDED

    @staticmethod
    def encode_vnode_lst(field: str, key_value_dct: dict) -> str:
        """list of node names -> "+"-joined string; VALUE_NOT_PROVIDED when absent."""
        value = key_value_dct.get(field, None)
        return "+".join(value) if value is not None else VALUE_NOT_PROVIDED

    @staticmethod
    def encode_exit_status(field: str, key_value_dct: dict) -> str:
        """exit status -> str; VALUE_NOT_PROVIDED when absent/None."""
        value = key_value_dct.get(field, None)
        return str(value) if value is not None else VALUE_NOT_PROVIDED

    @staticmethod
    def encode_x_used(field: str, key_value_dct: dict) -> str:
        """list -> ":"-joined string; FIELD_NOT_PROVIDED when absent/falsy.
        NOTE(review): join assumes string items — confirm callers pre-cast."""
        value = key_value_dct.get(field, FIELD_NOT_PROVIDED)
        return ":".join(value) if value else FIELD_NOT_PROVIDED

    @staticmethod
    def encode_place(field: str, key_value_dct: dict) -> str:
        """place dict -> Resource_List.place string."""
        return join_place_dict(key_value_dct.get(field, FIELD_NOT_PROVIDED))

    @staticmethod
    def encode_select(field: str, key_value_dct: dict) -> str:
        """list of chunk dicts -> Resource_List.select string."""
        return join_select_list(key_value_dct.get(field, FIELD_NOT_PROVIDED))


# field -> parser registrations
FieldHandlers.update_parsers(
    {
        "ctime": FieldHandlers.parse_epoch_str_to_datetime,
        "qtime": FieldHandlers.parse_epoch_str_to_datetime,
        "etime": FieldHandlers.parse_epoch_str_to_datetime,
        "stime": FieldHandlers.parse_epoch_str_to_datetime,
        "start": FieldHandlers.parse_epoch_str_to_datetime,
        "end": FieldHandlers.parse_epoch_str_to_datetime,
        "Resource_List.nodect": FieldHandlers.parse_str_to_int,
        "Resource_List.ncpus": FieldHandlers.parse_str_to_int,
        "Resource_List.ngpus": FieldHandlers.parse_str_to_int,
        "Resource_List.nproc": FieldHandlers.parse_str_to_int,
        "Resource_List.place": FieldHandlers.parse_place,
        "Resource_List.select": FieldHandlers.parse_select,
        "account": FieldHandlers.parse_pass_or_to_notprovided,
        "project": FieldHandlers.parse_pass_or_to_notprovided,
        "event_type": FieldHandlers.parse_pass_or_to_notprovided,
        "resource": FieldHandlers.parse_pass_or_to_notprovided,
        "sub_identifier": FieldHandlers.parse_pass_or_to_notprovided,
        "requester": FieldHandlers.parse_pass_or_to_notprovided,
        "run_count": FieldHandlers.parse_str_to_int,
        "Resource_List.walltime": FieldHandlers.parse_hms_to_seconds,
        "exec_host": FieldHandlers.parse_vnode_lst,
        "exec_vnode": FieldHandlers.parse_vnode_lst,
        "nodes": FieldHandlers.parse_vnode_lst,
        "Exit_status": FieldHandlers.parse_exit_status,
        "resources_used.walltime": FieldHandlers.parse_hms_to_seconds,
        "resources_used.nodect": FieldHandlers.parse_str_to_int,
        "resources_used.nodes_used": FieldHandlers.parse_x_used,
        "resources_used.cores_used": FieldHandlers.parse_x_used,
        "session": FieldHandlers.parse_pass_or_to_notprovided,
        "args": FieldHandlers.parse_pass_or_to_notprovided,
        "queue": FieldHandlers.parse_pass_or_to_notprovided,
        "mode": FieldHandlers.parse_pass_or_to_notprovided,
        "jobname": FieldHandlers.parse_pass_or_to_notprovided,
        "resvID": FieldHandlers.parse_pass_or_to_notprovided,
        "resvname": FieldHandlers.parse_pass_or_to_notprovided,
        "duration": FieldHandlers.parse_pass_or_to_notprovided,
        "group": FieldHandlers.parse_pass_or_to_notprovided,
        "exe": FieldHandlers.parse_pass_or_to_notprovided,
        "cwd": FieldHandlers.parse_pass_or_to_notprovided,
        "user": FieldHandlers.parse_pass_or_to_notprovided,
    }
)
# field -> encoder registrations (the inverse direction of the parsers above)
FieldHandlers.update_encoders(
    {
        "ctime": FieldHandlers.encode_datetime_to_epoch_str,
        "qtime": FieldHandlers.encode_datetime_to_epoch_str,
        "etime": FieldHandlers.encode_datetime_to_epoch_str,
        "stime": FieldHandlers.encode_datetime_to_epoch_str,
        "start": FieldHandlers.encode_datetime_to_epoch_str,
        "end": FieldHandlers.encode_datetime_to_epoch_str,
        "Resource_List.nodect": FieldHandlers.encode_int_to_str,
        "Resource_List.ncpus": FieldHandlers.encode_int_to_str,
        "Resource_List.ngpus": FieldHandlers.encode_int_to_str,
        "Resource_List.nproc": FieldHandlers.encode_int_to_str,
        "Resource_List.place": FieldHandlers.encode_place,
        "Resource_List.select": FieldHandlers.encode_select,
        "scheduler_timestamp": FieldHandlers.encode_datetime_to_str,
        "run_count": FieldHandlers.encode_int_to_str,
        "Resource_List.walltime": FieldHandlers.encode_seconds_to_hms,
        "exec_host": FieldHandlers.encode_vnode_lst,
        "nodes": FieldHandlers.encode_vnode_lst,
        "Exit_status": FieldHandlers.encode_exit_status,
        "resources_used.walltime": FieldHandlers.encode_seconds_to_hms,
        "resources_used.nodect": FieldHandlers.encode_int_to_str,
        "resources_used.nodes_used": FieldHandlers.encode_x_used,
        "resources_used.cores_used": FieldHandlers.encode_x_used,
        "account": FieldHandlers.encode_any_to_str,
        "project": FieldHandlers.encode_any_to_str,
        "action": FieldHandlers.encode_any_to_str,
        "record": FieldHandlers.encode_any_to_str,
        "event_type": FieldHandlers.encode_any_to_str,
        "session": FieldHandlers.encode_any_to_str,
        "user": FieldHandlers.encode_any_to_str,
        "queue": FieldHandlers.encode_any_to_str,
        "args": FieldHandlers.encode_any_to_str,
        "mode": FieldHandlers.encode_any_to_str,
        "jobname": FieldHandlers.encode_any_to_str,
        "resvname": FieldHandlers.encode_any_to_str,
        "resvID": FieldHandlers.encode_any_to_str,
        "duration": FieldHandlers.encode_any_to_str,
        "group": FieldHandlers.encode_any_to_str,
        "exe": FieldHandlers.encode_any_to_str,
        "cwd": FieldHandlers.encode_any_to_str,
    }
)


def process_record_Reservation(field_handler, rec_type, resource, key_value_dct, dct):
    """Record processor for reservation records (Y/B/F/K/k).

    Fills in reservation-specific fields: requester (either spelling),
    duration, Resource_List.*/resources_used.* via registered parsers,
    and the user/owner/name/queue attributes."""
    dct["sub_identifier"] = key_value_dct.get("active_id", FIELD_NOT_PROVIDED)
    dct["requester"] = _get_requester(key_value_dct)
    dct["duration"] = cast_to_int(key_value_dct.get("duration", FIELD_NOT_PROVIDED))
    for key in key_value_dct:
        # parse every resource request/usage field with its registered parser
        if key.startswith("Resource_List.") or key.startswith("resources_used."):
            dct[key] = field_handler.parse_from_str(key, key_value_dct, bypass=True)
    dct["Authorized_Users"] = cast_to_list(
        convert_none_str_to_empty_list(key_value_dct.get("Authorized_Users", FIELD_NOT_PROVIDED)),
        ",",
    )
    dct["owner"] = key_value_dct.get("owner", FIELD_NOT_PROVIDED)
    dct["users"] = cast_to_list(convert_none_str_to_empty_list(key_value_dct.get("users", FIELD_NOT_PROVIDED)), ",")
    dct["name"] = key_value_dct.get("name", FIELD_NOT_PROVIDED)
    dct["queue"] = key_value_dct.get("queue", FIELD_NOT_PROVIDED)
    return dct
def process_record_Job_SER(field_handler, rec_type, resource, key_value_dct, dct):
    """Record processor for job start/end/rerun (S/E/R) records.

    Copies the raw user name and exec_vnode, parses the estimated node/core
    lists, and — for end (E) records only — the exit status and actual
    node/core usage."""
    dct["raw_user_name"] = dct["user"]
    dct["raw_exec_vnode"] = key_value_dct.get("exec_vnode", FIELD_NOT_PROVIDED)
    # pbs job arrays don't run on anything thus don't provide an exec_host
    dct["Resource_List.nodes_estimated"] = cast_to_list_of_x(
        key_value_dct.get("Resource_List.nodes_estimated", FIELD_NOT_PROVIDED), ":", cast_to_int
    )
    dct["Resource_List.cores_estimated"] = cast_to_list_of_x(
        key_value_dct.get("Resource_List.cores_estimated", FIELD_NOT_PROVIDED), ":", cast_to_int
    )
    if dct["record"] == rec_type.E:
        dct["Exit_status"] = field_handler.parse_from_str("Exit_status", key_value_dct)
        dct["resources_used.nodect"] = cast_to_int(key_value_dct.get("resources_used.nodect", FIELD_NOT_PROVIDED))
        dct["resources_used.nodes_used"] = cast_to_list_of_x(
            key_value_dct.get("resources_used.nodes_used", FIELD_NOT_PROVIDED), ":", cast_to_int
        )
        dct["resources_used.cores_used"] = cast_to_list_of_x(
            key_value_dct.get("resources_used.cores_used", FIELD_NOT_PROVIDED), ":", cast_to_int
        )
    return dct


def process_record_Job_Q(field_handler, rec_type, resource, key_value_dct, dct):
    """Record processor for job queue (Q) records — nothing extra to add."""
    return dct


def process_record_Job_D(field_handler, rec_type, resource, key_value_dct, dct):
    """Record processor for job delete (D) records — captures the requester."""
    dct["requester"] = _get_requester(key_value_dct)
    return dct


# record code -> processor; A records need no extra processing (identity).
FieldHandlers.update_record_handler(
    {
        RecType.A: lambda field_handler, rec_type, resource, key_value_dct, dct: dct,
        RecType.Y: process_record_Reservation,
        RecType.B: process_record_Reservation,
        RecType.F: process_record_Reservation,
        RecType.K: process_record_Reservation,
        RecType.k: process_record_Reservation,
        RecType.S: process_record_Job_SER,
        RecType.E: process_record_Job_SER,
        RecType.R: process_record_Job_SER,
        RecType.Q: process_record_Job_Q,
        RecType.D: process_record_Job_D,
    }
)


# module-level alias used by the organizing functions below
field_handler = FieldHandlers
FIELD_NOT_PROVIDED or k in ["event_type", "record"]: + final_dct[k] = dct[k] + return final_dct + + +def enhance_pbs_record(timestamp: datetime.datetime, record_value, identifier, dct=None, rec_type: RecType = None): + """given pbs record data, enhance the record""" + if not dct: + dct = {} + else: + dct = dct.copy() + rec_type = rec_type if rec_type else RecType + try: + dct_record_value = rec_type.dct[record_value] + except (KeyError, IndexError): + # no record type found + dct_record_value = rec_type.dct[None] + dct.update(**dct_record_value) + # required fields for all records + dct["identifier"] = identifier + dct["scheduler_timestamp"] = timestamp + if "sub_identifier" not in dct: + dct["sub_identifier"] = "0" + return dct + + +def parse_pbs_log_line(timestamp: datetime.datetime, data, resource=None, rec_type: RecType = None): + """Parse the pbs log lines""" + rec_type = rec_type if rec_type else RecType + record_value, identifier, record_keyvals = split_record(data) + dct = enhance_pbs_record(timestamp, record_value, identifier, rec_type=rec_type) + if record_value == rec_type.L: + dct["resource"] = resource if resource else FIELD_NOT_PROVIDED + dct["sub_identifier"] = "0" + dct["data"] = record_keyvals + result = dct + else: + key_value_dct = parse_key_value_pairs(record_keyvals, record_value=record_value) + result = parse_pbs_organize_keyvals(dct, resource, key_value_dct, rec_type) + return result + + +def create_pbs_log_line( + timestamp: datetime.datetime, + record: str, + identifier: str, + kv_pairs: dict, + attempt_str=False, + add_extra=False, + rec_type=None, + iso8601=False, +): + """ + timestamp: scheduler timestamp + record RecType.* + identifier: jobid or reservationid or the word license, could be an uuid + kv_pairs: + attempt_str will tell the encoder to execute str(x) to convert it to a line if there is no encoder found. 
+ + """ + rec_type = rec_type if rec_type else RecType + assert record in rec_type.dct, f"record {record} is not a valid RecType" + sep = LOG_REGEX.LOG_FORMAT_DATA_SEPERATOR_PBS + kv_sep = LOG_REGEX.LOG_FORMAT_KV_SEPERATOR_PBS + ts_us = timestamp.strftime(LOG_REGEX.LOG_FORMAT_ISO8601) + if iso8601: + ts = ts_us + else: + ts = timestamp.strftime(LOG_REGEX.LOG_FORMAT_PBS) + # make sure identifier is a specific size and characters. + head = f"{ts}{sep}{record}{sep}{identifier}" + kv_pair_lst = [] + + # rename the fields to overwrite any keys that already may have that name. + for key, value in field_handler.encode_key_rename.items(): + if key in kv_pairs: + new_key = field_handler.encode_key_rename[key] + kv_pairs[new_key] = kv_pairs[key] + + for key in kv_pairs.keys(): + if key in field_handler.encode_key_rename: + key = field_handler.encode_key_rename[key] + if key in field_handler.builtin_keys: + continue # we already have these. + if key in field_handler.encode_passthrough_lst: + value = kv_pairs.get(key, None) + value = value if value is not None else VALUE_NOT_PROVIDED + else: + value = field_handler.encode_to_str(key, kv_pairs, attempt_str=attempt_str) + kv_pair_lst.append(f"{key}={value}") + if add_extra: + kv_pair_lst.append(f"ts={ts_us}") + + tail = kv_sep.join(kv_pair_lst) + line = f"{head}{sep}{tail}" + return line + + +@dataclass +class PBSRecordKVPairs: + ts: str + te: str + kwargs: dataclasses.field(default_factory=dict) + + def __post_init__(self): + [setattr(self, k, v) for k, v in self.kwargs.items() if v is not None] + + +@dataclass +class PBSRecordBase: + timestamp: datetime + record: str + identifier: str + kv_pairs: PBSRecordKVPairs + + def get_dct(self): + dct = self.kv_pairs.kwargs.copy() + ts = self.kv_pairs.ts + te = self.kv_pairs.te + if ts: + dct["ts"] = ts + if te: + dct["te"] = te + dct["timestamp"] = self.timestamp + dct["record"] = self.record + dct["identifier"] = self.identifier + return dct + + +def create_pbs_record(timestamp: 
datetime.datetime, record, identifier, dct) -> PBSRecordBase: + """ + create a PBSRecordBase. + The keys in dct of ts and te are special and will be flattened into strings. + """ + ts_obj = dct["ts"].strftime(LOG_REGEX.LOG_FORMAT_ISO8601) if "ts" in dct else None + te_obj = dct["te"].strftime(LOG_REGEX.LOG_FORMAT_ISO8601) if "te" in dct else None + rkvp = PBSRecordKVPairs(ts_obj, te_obj, dct) + rb = PBSRecordBase(timestamp, record, identifier, rkvp) + return rb + + +def create_pbs_log_line_from_record(record_obj, attempt_str=False, iso8601=False): + dct = record_obj.get_dct() + dct_clean = dict((k, v) for k, v in dct.items() if v is not None) # remove nulls + del dct_clean["timestamp"] # this is sent in the next line + if "scheduler_timestamp" in dct_clean: + del dct_clean["scheduler_timestamp"] + line = create_pbs_log_line( + record_obj.timestamp, record_obj.record, record_obj.identifier, dct_clean, attempt_str=attempt_str, iso8601=iso8601 + ) + return line diff --git a/PBS_Utils/pbs_enum.py b/PBS_Utils/pbs_enum.py new file mode 100644 index 0000000000000000000000000000000000000000..4c8923b9c57041c73cd5e85a0435ff08f669097d --- /dev/null +++ b/PBS_Utils/pbs_enum.py @@ -0,0 +1,333 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. 
+ +from enum import Enum, auto + + +class PBS_BATCH(Enum): + PBS_BATCH_Connect = 0 + PBS_BATCH_QueueJob = 1 + PBS_BATCH_PostQueueJob = 2 + PBS_BATCH_jobscript = 3 + PBS_BATCH_RdytoCommit = 4 + PBS_BATCH_Commit = 5 + PBS_BATCH_DeleteJob = 6 + PBS_BATCH_HoldJob = 7 + PBS_BATCH_LocateJob = 8 + PBS_BATCH_Manager = 9 + PBS_BATCH_MessJob = 10 + PBS_BATCH_ModifyJob = 11 + PBS_BATCH_MoveJob = 12 + PBS_BATCH_ReleaseJob = 13 + PBS_BATCH_Rerun = 14 + PBS_BATCH_RunJob = 15 + PBS_BATCH_SelectJobs = 16 + PBS_BATCH_Shutdown = 17 + PBS_BATCH_SignalJob = 18 + PBS_BATCH_StatusJob = 19 + PBS_BATCH_StatusQue = 20 + PBS_BATCH_StatusSvr = 21 + PBS_BATCH_TrackJob = 22 + PBS_BATCH_AsyrunJob = 23 + PBS_BATCH_Rescq = 24 + PBS_BATCH_ReserveResc = 25 + PBS_BATCH_ReleaseResc = 26 + PBS_BATCH_FailOver = 27 + PBS_BATCH_JobObit = 28 + PBS_BATCH_StageIn = 48 + PBS_BATCH_OrderJob = 50 + PBS_BATCH_SelStat = 51 + PBS_BATCH_RegistDep = 52 + PBS_BATCH_CopyFiles = 54 + PBS_BATCH_DelFiles = 55 + PBS_BATCH_MvJobFile = 57 + PBS_BATCH_StatusNode = 58 + PBS_BATCH_Disconnect = 59 + PBS_BATCH_JobCred = 62 + PBS_BATCH_CopyFiles_Cred = 63 + PBS_BATCH_DelFiles_Cred = 64 + PBS_BATCH_SubmitResv = 70 + PBS_BATCH_StatusResv = 71 + PBS_BATCH_DeleteResv = 72 + PBS_BATCH_BeginResv = 76 + PBS_BATCH_UserCred = 73 + PBS_BATCH_ConfirmResv = 75 + PBS_BATCH_DefSchReply = 80 + PBS_BATCH_StatusSched = 81 + PBS_BATCH_StatusRsc = 82 + PBS_BATCH_StatusHook = 83 + PBS_BATCH_PySpawn = 84 + PBS_BATCH_CopyHookFile = 85 + PBS_BATCH_DelHookFile = 86 + PBS_BATCH_HookPeriodic = 89 + PBS_BATCH_RelnodesJob = 90 + PBS_BATCH_ModifyResv = 91 + PBS_BATCH_ResvOccurEnd = 92 + PBS_BATCH_PreemptJobs = 93 + PBS_BATCH_Cred = 94 + PBS_BATCH_Authenticate = 95 + PBS_BATCH_ModifyJob_Async = 96 + PBS_BATCH_AsyrunJob_ack = 97 + PBS_BATCH_RegisterSched = 98 + PBS_BATCH_ModifyVnode = 99 + PBS_BATCH_DeleteJobList = 100 + + +class PBS_SCHED(Enum): + SCH_SCHEDULE_NULL = 0 + SCH_SCHEDULE_NEW = auto() + SCH_SCHEDULE_TERM = auto() + SCH_SCHEDULE_TIME = auto() + 
SCH_SCHEDULE_RECYC = auto() + SCH_SCHEDULE_CMD = auto() + SCH_CONFIGURE = auto() + SCH_QUIT = auto() + SCH_RULESET = auto() + SCH_SCHEDULE_FIRST = auto() + SCH_SCHEDULE_JOBRESV = auto() + SCH_SCHEDULE_AJOB = auto() + SCH_SCHEDULE_STARTQ = auto() + SCH_SCHEDULE_MVLOCAL = auto() + SCH_SCHEDULE_ETE_ON = auto() + SCH_SCHEDULE_RESV_RECONFIRM = auto() + SCH_SCHEDULE_RESTART_CYCLE = auto() + SCH_CMD_HIGH = auto() + + +class DIS_ERR(Enum): + DIS_SUCCESS = 0 + DIS_OVERFLOW = 1 + DIS_HUGEVAL = 2 + DIS_BADSIGN = 3 + DIS_LEADZRO = 4 + DIS_NONDIGIT = 5 + DIS_NULLSTR = 6 + DIS_EOD = 7 + DIS_NOMALLOC = 8 + DIS_PROTO = 9 + DIS_NOCOMMIT = 10 + DIS_EOF = 11 + + +class PBS_REPLY(Enum): + PBSE_ = 15000 + PBSE_NONE = 0 + PBSE_UNKJOBID = 15001 + PBSE_NOATTR = 15002 + PBSE_ATTRRO = 15003 + PBSE_IVALREQ = 15004 + PBSE_UNKREQ = 15005 + PBSE_TOOMANY = 15006 + PBSE_PERM = 15007 + PBSE_BADHOST = 15008 + PBSE_JOBEXIST = 15009 + PBSE_SYSTEM = 15010 + PBSE_INTERNAL = 15011 + PBSE_REGROUTE = 15012 + PBSE_UNKSIG = 15013 + PBSE_BADATVAL = 15014 + PBSE_MODATRRUN = 15015 + PBSE_BADSTATE = 15016 + PBSE_UNKQUE = 15018 + PBSE_BADCRED = 15019 + PBSE_EXPIRED = 15020 + PBSE_QUNOENB = 15021 + PBSE_QACESS = 15022 + PBSE_BADUSER = 15023 + PBSE_HOPCOUNT = 15024 + PBSE_QUEEXIST = 15025 + PBSE_ATTRTYPE = 15026 + PBSE_OBJBUSY = 15027 + PBSE_QUENBIG = 15028 + PBSE_NOSUP = 15029 + PBSE_QUENOEN = 15030 + PBSE_PROTOCOL = 15031 + PBSE_BADATLST = 15032 + PBSE_NOCONNECTS = 15033 + PBSE_NOSERVER = 15034 + PBSE_UNKRESC = 15035 + PBSE_EXCQRESC = 15036 + PBSE_QUENODFLT = 15037 + PBSE_NORERUN = 15038 + PBSE_ROUTEREJ = 15039 + PBSE_ROUTEEXPD = 15040 + PBSE_MOMREJECT = 15041 + PBSE_BADSCRIPT = 15042 + PBSE_STAGEIN = 15043 + PBSE_RESCUNAV = 15044 + PBSE_BADGRP = 15045 + PBSE_MAXQUED = 15046 + PBSE_CKPBSY = 15047 + PBSE_EXLIMIT = 15048 + PBSE_BADACCT = 15049 + PBSE_ALRDYEXIT = 15050 + PBSE_NOCOPYFILE = 15051 + PBSE_CLEANEDOUT = 15052 + PBSE_NOSYNCMSTR = 15053 + PBSE_BADDEPEND = 15054 + PBSE_DUPLIST = 15055 + PBSE_DISPROTO = 
15056 + PBSE_EXECTHERE = 15057 + PBSE_SISREJECT = 15058 + PBSE_SISCOMM = 15059 + PBSE_SVRDOWN = 15060 + PBSE_CKPSHORT = 15061 + PBSE_UNKNODE = 15062 + PBSE_UNKNODEATR = 15063 + PBSE_NONODES = 15064 + PBSE_NODENBIG = 15065 + PBSE_NODEEXIST = 15066 + PBSE_BADNDATVAL = 15067 + PBSE_MUTUALEX = 15068 + PBSE_GMODERR = 15069 + PBSE_NORELYMOM = 15070 + PBSE_NOTSNODE = 15071 + PBSE_RESV_NO_WALLTIME = 15075 + PBSE_JOBNOTRESV = 15076 + PBSE_TOOLATE = 15077 + PBSE_IRESVE = 15078 + PBSE_RESVEXIST = 15080 + PBSE_resvFail = 15081 + PBSE_genBatchReq = 15082 + PBSE_mgrBatchReq = 15083 + PBSE_UNKRESVID = 15084 + PBSE_delProgress = 15085 + PBSE_BADTSPEC = 15086 + PBSE_RESVMSG = 15087 + PBSE_NOTRESV = 15088 + PBSE_BADNODESPEC = 15089 + PBSE_LICENSEINV = 15091 + PBSE_RESVAUTH_H = 15092 + PBSE_RESVAUTH_G = 15093 + PBSE_RESVAUTH_U = 15094 + PBSE_R_UID = 15095 + PBSE_R_GID = 15096 + PBSE_IBMSPSWITCH = 15097 + PBSE_UNUSED2 = 15098 + PBSE_NOSCHEDULER = 15099 + PBSE_RESCNOTSTR = 15100 + PBSE_MaxArraySize = 15107 + PBSE_INVALSELECTRESC = 15108 + PBSE_INVALJOBRESC = 15109 + PBSE_INVALNODEPLACE = 15110 + PBSE_PLACENOSELECT = 15111 + PBSE_INDIRECTHOP = 15112 + PBSE_INDIRECTBT = 15113 + PBSE_NODESTALE = 15115 + PBSE_DUPRESC = 15116 + PBSE_CONNFULL = 15117 + PBSE_LICENSE_MIN_BADVAL = 15118 + PBSE_LICENSE_MAX_BADVAL = 15119 + PBSE_LICENSE_LINGER_BADVAL = 15120 + PBSE_UNUSED3 = 15121 + PBSE_UNUSED4 = 15122 + PBSE_BAD_FORMULA = 15123 + PBSE_BAD_FORMULA_KW = 15124 + PBSE_BAD_FORMULA_TYPE = 15125 + PBSE_BAD_RRULE_YEARLY = 15126 + PBSE_BAD_RRULE_MONTHLY = 15127 + PBSE_BAD_RRULE_WEEKLY = 15128 + PBSE_BAD_RRULE_DAILY = 15129 + PBSE_BAD_RRULE_HOURLY = 15130 + PBSE_BAD_RRULE_MINUTELY = 15131 + PBSE_BAD_RRULE_SECONDLY = 15132 + PBSE_BAD_RRULE_SYNTAX = 15133 + PBSE_BAD_RRULE_SYNTAX2 = 15134 + PBSE_BAD_ICAL_TZ = 15135 + PBSE_HOOKERROR = 15136 + PBSE_NEEDQUET = 15137 + PBSE_ETEERROR = 15138 + PBSE_HISTJOBID = 15139 + PBSE_JOBHISTNOTSET = 15140 + PBSE_MIXENTLIMS = 15141 + PBSE_ENTLIMCT = 15142 + PBSE_ENTLIMRESC 
= 15143 + PBSE_ATVALERANGE = 15144 + PBSE_PROV_HEADERROR = 15145 + PBSE_NODEPROV_NOACTION = 15146 + PBSE_NODEPROV = 15147 + PBSE_NODEPROV_NODEL = 15148 + PBSE_NODE_BAD_CURRENT_AOE = 15149 + PBSE_NOLOOPBACKIF = 15153 + PBSE_IVAL_AOECHUNK = 15155 + PBSE_JOBINRESV_CONFLICT = 15156 + PBSE_NORUNALTEREDJOB = 15157 + PBSE_HISTJOBDELETED = 15158 + PBSE_NOHISTARRAYSUBJOB = 15159 + PBSE_FORCE_QSUB_UPDATE = 15160 + PBSE_SAVE_ERR = 15161 + PBSE_MAX_NO_MINWT = 15162 + PBSE_MIN_GT_MAXWT = 15163 + PBSE_NOSTF_RESV = 15164 + PBSE_NOSTF_JOBARRAY = 15165 + PBSE_NOLIMIT_RESOURCE = 15166 + PBSE_MOM_INCOMPLETE_HOOK = 15167 + PBSE_MOM_REJECT_ROOT_SCRIPTS = 15168 + PBSE_HOOK_REJECT = 15169 + PBSE_HOOK_REJECT_RERUNJOB = 15170 + PBSE_HOOK_REJECT_DELETEJOB = 15171 + PBSE_IVAL_OBJ_NAME = 15172 + PBSE_JOBNBIG = 15173 + PBSE_RESCBUSY = 15174 + PBSE_JOBSCRIPTMAXSIZE = 15175 + PBSE_BADJOBSCRIPTMAXSIZE = 15176 + PBSE_WRONG_RESUME = 15177 + PBSE_RESV_NOT_EMPTY = 15178 + PBSE_STDG_RESV_OCCR_CONFLICT = 15179 + PBSE_SOFTWT_STF = 15180 + PBSE_RESV_FROM_RESVJOB = 15181 + PBSE_RESV_FROM_ARRJOB = 15182 + PBSE_SELECT_NOT_SUBSET = 15183 + PBSE_RMUNKNOWN = 15201 + PBSE_RMBADPARAM = 15202 + PBSE_RMNOPARAM = 15203 + PBSE_RMEXIST = 15204 + PBSE_RMSYSTEM = 15205 + PBSE_RMPART = 15206 + RM_ERR_UNKNOWN = 15201 + RM_ERR_BADPARAM = 15202 + RM_ERR_NOPARAM = 15203 + RM_ERR_EXIST = 15204 + RM_ERR_SYSTEM = 15205 + PBSE_TRYAGAIN = 15208 + PBSE_ALPSRELERR = 15209 + PBSE_JOB_MOVED = 15210 + PBSE_SCHEDEXIST = 15211 + PBSE_SCHED_NAME_BIG = 15212 + PBSE_UNKSCHED = 15213 + PBSE_SCHED_NO_DEL = 15214 + PBSE_SCHED_PRIV_EXIST = 15215 + PBSE_SCHED_LOG_EXIST = 15216 + PBSE_ROUTE_QUE_NO_PARTITION = 15217 + PBSE_CANNOT_SET_ROUTE_QUE = 15218 + PBSE_QUE_NOT_IN_PARTITION = 15219 + PBSE_PARTITION_NOT_IN_QUE = 15220 + PBSE_INVALID_PARTITION_QUE = 15221 + PBSE_ALPS_SWITCH_ERR = 15222 + PBSE_SCHED_OP_NOT_PERMITTED = 15223 + PBSE_SCHED_PARTITION_ALREADY_EXISTS = 15224 + PBSE_INVALID_MAX_JOB_SEQUENCE_ID = 15225 + PBSE_SVR_SCHED_JSF_INCOMPAT = 
15226 + PBSE_NODE_BUSY = 15227 + PBSE_DEFAULT_PARTITION = 15228 + PBSE_HISTDEPEND = 15229 + PBSE_SCHEDCONNECTED = 15230 + PBSE_NOTARRAY_ATTR = 15231 + + +class LogEventTypes(Enum): + PBSEVENT_ERROR = 0x0001 + PBSEVENT_SYSTEM = 0x0002 + PBSEVENT_ADMIN = 0x0004 + PBSEVENT_JOB = 0x0008 + PBSEVENT_JOB_USAGE = 0x0010 + PBSEVENT_SECURITY = 0x0020 + PBSEVENT_SCHED = 0x0040 + PBSEVENT_DEBUG = 0x0080 + PBSEVENT_DEBUG2 = 0x0100 + PBSEVENT_RESV = 0x0200 + PBSEVENT_DEBUG3 = 0x0400 + PBSEVENT_DEBUG4 = 0x0800 + PBSEVENT_DEBUGPRT = 0x1000 + PBSEVENT_FORCE = 0x8000 diff --git a/PBS_Utils/pbs_hook_library.py b/PBS_Utils/pbs_hook_library.py new file mode 100644 index 0000000000000000000000000000000000000000..0d4d2d0da0a04798596381d9f65f47d76edd7677 --- /dev/null +++ b/PBS_Utils/pbs_hook_library.py @@ -0,0 +1,202 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +""" +Rules +only allowed to import pbs_util.py +no importing anything except basic python packages +no creating instances of classes globally, this could lead to memory leaks. +do not import pbs, have it passed in, this allows testing and mocking more easily. + +Remember this is imported into pbs in the server and pbs_python. +Pattern for importing this: + import PBS_Utils + import PBS_Utils.pbs_hook_library + importlib.reload(PBS_Utils) + importlib.reload(PBS_Utils.pbs_hook_library) + from PBS_Utils.pbs_hook_library import add_node_state_node_for_an_event + +Never do this in a hook or hook code: + obj = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S.%f") + +TODO: figure this out + 1. Don't directly log, raise exception with message and catch it on the hook side. 
+""" +from typing import Optional + + +class PBS_Serializer: + def __init__(self, pbs): + self.pbs = pbs + self.server_object_key_skips = ( + "attributes", + "_attributes_hook_set", + "_attributes_unknown", + "topology_info", # contains hwloc + ) + + def serialize_pbs_module(self): + pbs = self.pbs + dct = {} + for key in dir(pbs): + val = getattr(pbs, key) + if ( + not callable(val) + and not key.startswith("__") + and key != "EXPORTED_TYPES_DICT" + and not repr(val).startswith("<module ") + ): + dct[key] = val + else: + pass + return dct + + def serialize_server_object_value(self, val, include_none=True): + pbs = self.pbs + bt = self.pbs.v1._base_types + et = self.pbs.v1._exc_types + val_type = type(val) + # if isinstance(val, (int, float, str, bool)): + # result = val # DO NOT DO THIS! + if isinstance(val, str): + result = str(val) + elif isinstance(val, int): + result = int(val) + elif isinstance(val, float): + result = float(val) + elif isinstance(val, bool): + result = bool(val) + elif isinstance(val, dict): + # note, this is weird, it's dict, but will not work right + # this fixes at least Variable_List + val = dict(val) + dct = {} + for k, v in val.items(): + if v is not None or include_none: + new_val = self.serialize_server_object_value(v, include_none=include_none) + if new_val is not None or include_none: + dct[k] = new_val + result = dct + elif isinstance(val, pbs.v1._base_types.pbs_resource): + result = dict(val) + elif isinstance(val, (bt.pbs_bool,)): + # _derived_types = (bool,) + result = bool(val) + elif isinstance(val, (bt.priority, bt.job_state)): + # (_generic_attr, int) or (int, ) + result = int(val._value) + elif isinstance(val, (bt.path, bt.sandbox, bt.version, bt.name, bt.project)): + # (_generic_attr, str) or (str, _generic_attr) + result = str(val) + elif isinstance( + val, + ( + bt.join_path, + bt.user_list, + bt.group_list, + bt.depend, + bt.checkpoint, + bt.exec_host, + bt.keep_files, + bt.mail_points, + bt.staging_list, + bt.range, + 
bt.state_count, + bt.place, + bt.hold_types, + ), + ): + # (_generic_attr,) + result = str(val) + # elif isinstance(val, pbs.v1._base_types._generic_attr): + # # note this will probably lose information + # result = str(val) + elif str(val_type).startswith("<class 'pbs.v1._svr_types."): + result = self.serialize_server_object(val, include_none=include_none) + elif str(val_type).startswith("<class 'pbs.v1._base_types."): + result = self.serialize_server_object(val, include_none=include_none) + elif val_type in (tuple, list, set): + lst = [] + for o in val: + if o is not None or include_none: + lst.append(self.serialize_server_object_value(o, include_none=include_none)) + result = val_type(lst) + elif val_type == dict: + dct = {} + for k, v in val.items(): + if v is not None or include_none: + new_val = self.serialize_server_object_value(v, include_none=include_none) + if new_val is not None or include_none: + dct[k] = new_val + result = dct + else: + result = val + if not include_none and result == "": + result = None + return result + + def serialize_server_object(self, obj, include_none=True): + # include_node==False removes None values + dct = {} + for key in dir(obj): + if not key.startswith("__") and key not in self.server_object_key_skips: + val = getattr(obj, key) + if not callable(val): + if key == "_param" and isinstance(val, dict): + for k, v in val.items(): + if v is not None or include_none: + dct[k] = self.serialize_server_object_value(v, include_none=include_none) + else: + if val is not None or include_none: + new_val = self.serialize_server_object_value(val, include_none=include_none) + if new_val is not None or include_none: + dct[key] = new_val + return dct + + +def add_node_state_via_server(server_obj, event_obj, context: str, node: str, state) -> Optional[str]: + """for server side hooks or to offline any node.""" + err = None + vnode_obj = None + + try: + vnode_obj = server_obj.vnodes[node] + except KeyError: + err = f"{context}; Missing {node} 
in server_obj.vnodes" + else: + if event_obj and hasattr(event_obj, "vnode_list_fail"): + event_obj.vnode_list_fail[node] = vnode_obj + + if vnode_obj: + if vnode_obj.state: + vnode_obj.state += state + else: + vnode_obj.state = state + vnode_obj.comment = err + return err + + +def add_node_state_node_for_an_event(event_obj, node: str, state, comment=None) -> Optional[str]: + """for offing nodes for a mom associated with the event.""" + err = None + vnode_obj = None + + try: + vnode_obj = event_obj.vnode_list_fail[node] + except KeyError: + try: + vnode_obj = event_obj.vnode_list[node] + except KeyError: + err = f"Missing {node} in event_obj.vnode_list." + else: + event_obj.vnode_list_fail[node] = vnode_obj + + if vnode_obj: + if vnode_obj.state: + vnode_obj.state += state + else: + vnode_obj.state = state + if comment: + vnode_obj.comment = comment + return err diff --git a/PBS_Utils/pbs_instrument.py b/PBS_Utils/pbs_instrument.py new file mode 100644 index 0000000000000000000000000000000000000000..551c13c5dc2dd72390afecd056eae35fd3162713 --- /dev/null +++ b/PBS_Utils/pbs_instrument.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +import json +import os +import uuid +import time +import socket +import inspect +import functools +from os import linesep +from contextlib import contextmanager +from typing import List, TypedDict +from PBS_Utils.pbs_util import get_now + +OT_TRACER = None # there must be only one OT tracer. 
+ + +def formatter(span): + return span.to_json(indent=None) + linesep + + +def get_callinfo(jump_back_count=1, raw=True): + current_frame = inspect.currentframe() + frame, filename, line_number, function_name, lines, index = inspect.getouterframes(current_frame)[jump_back_count] + fpath, fname = os.path.split(filename) + lineno = frame.f_lineno + if raw: + result = {"fname": fname, "lineno": lineno, "filename": filename, "index": index, "function_name": function_name} + else: + result = "%-13s:L#%s" % (fname, lineno) + return result + + +# ---- vvv ---- TracerLogger ---- vvv ---- +class TimeIt: + def __init__(self): + self.ts = None + self.te = None + + def __enter__(self): + self.ts = time.time() + return self + + def __exit__(self, *args): + self.te = time.time() + + def get_td(self): + return self.te - self.ts + + +class Timings(TypedDict): + name: str # name of timing + ts: float # time start of scope + te: float # time end of scope + call_i: dict # call in info + call_e: dict # call exit info + td_src: float # time of just the source code + td_total: float # total time + + +class TracerSimple: + def __init__(self, name, output_path): + self.name = name + self.current_span_name = "instrument_setup" + self.output_path = output_path + self.traces: List[Timings] = [] + self.instrument_timings = [] + self.ts_inst = time.time() + with TimeIt() as ti: + self.call_i = get_callinfo(jump_back_count=5) + self.instrument_timings.append(ti.get_td()) + self.te_inst = self.ts_inst + with TimeIt() as ti: + self.call_e = get_callinfo(jump_back_count=5) + self.instrument_timings.append(ti.get_td()) + self.td_code = 0.0 + self.ts_code = time.time() + self.te_code = self.ts_code + + @contextmanager + def start_as_current_span(self, name, *args, **kwargs): + with TimeIt() as ti_enter: + call_i = get_callinfo(jump_back_count=3) + td_start_inst = ti_enter.get_td() + self.instrument_timings.append(td_start_inst) + self.current_span_name = name + with TimeIt() as ti_src: + yield self 
+ ts_src = ti_src.ts + te_src = ti_src.te + with TimeIt() as ti_exit: + call_e = get_callinfo(jump_back_count=3) + td_end_inst = ti_exit.get_td() + self.instrument_timings.append(td_end_inst) + td_src = te_src - ts_src + td_total = td_start_inst + td_end_inst + td_src + self.traces.append( + Timings(name=name, ts=ts_src, te=te_src, call_i=call_i, call_e=call_e, td_src=td_src, td_total=td_total) + ) + + def shutdown(self): + self.te_code = time.time() + self.td_code = self.te_code - self.ts_code + self.current_span_name = "shutdown" + with TimeIt() as ti: + self.call_e = get_callinfo(jump_back_count=4) + self.instrument_timings.append(ti.get_td()) + self.te_inst = time.time() + td_src = self.te_code - self.ts_code + td_total = self.te_inst - self.ts_inst + self.traces.append( + Timings( + name=self.name, + ts=self.ts_code, + te=self.te_code, + call_i=self.call_i, + call_e=self.call_e, + td_src=td_src, + td_total=td_total, + ) + ) + if self.output_path: + with open(self.output_path, "w") as fd: + for dct in self.traces: + fd.write(json.dumps(dct)) + fd.write("\n") + + def get_current_span(self): + # todo: this would require a refactor to create new spans + return self + + +# ---- ^^^ ---- TracerLogger ---- ^^^ ---- + + +# ---- vvv ---- TracerFake ---- vvv ---- +class TracerFake: + def __init__(self, name, output_path): + self.name = name + self.output_path = output_path + + @contextmanager + def start_as_current_span(self, name, *args, **kwargs): + yield self + + def shutdown(self): + pass + + def get_current_span(self): + return self + + +# ---- ^^^ ---- TracerFake ---- ^^^ ---- + + +def get_tracer(trace_name, output_path, tracer_class): + return tracer_class(trace_name, output_path) + + +@contextmanager +def instrument_setup(tracer_name, output_path, tracer_class=None): + if tracer_name and not tracer_class: + global OT_TRACER + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider, Tracer + from opentelemetry.sdk.trace.export import 
BatchSpanProcessor, ConsoleSpanExporter + + # Only can have one tracer per program. Do not call more than once per program. + outer_tracer: Tracer = trace.get_tracer(tracer_name) + OT_TRACER = outer_tracer + # trace.set_tracer_provider(TracerProvider(shutdown_on_exit=True)) + trace.set_tracer_provider(TracerProvider(shutdown_on_exit=False)) + fd = open(output_path, "a") + bsp = BatchSpanProcessor(ConsoleSpanExporter(out=fd, formatter=formatter)) + trace.get_tracer_provider().add_span_processor(bsp) + try: + yield outer_tracer + finally: + fd.flush() + bsp.shutdown() + fd.close() + OT_TRACER = None + else: + tracer_class = tracer_class if tracer_class else TracerFake + outer_tracer = get_tracer(tracer_name, output_path, tracer_class) + try: + yield outer_tracer + finally: + outer_tracer.shutdown() + + +def ot_decorate_span(tracer_inner): + """wrap a function call with opentelemetry.""" + pid = os.getpid() + hostname = socket.gethostname() + + def outer_decorate(func): + + @functools.wraps(func) + def decorate(*args, **kwargs): + global OT_TRACER + if OT_TRACER: + from opentelemetry.trace.status import Status, StatusCode + + current_frame = inspect.currentframe() + frame, filename, line_number, function_name, lines, index = inspect.getouterframes(current_frame)[1] + now_str = get_now().strftime("%Y-%m-%d %H:%M:%S.%f") + # all long calls must be done before the with line. + with tracer_inner.start_as_current_span(f"{func.__name__}") as span: + span.set_attribute("hostname", hostname) + span.set_attribute("pid", pid) + span.set_attribute("timestamp", now_str) + span.set_attribute("filename", filename) + span.set_attribute("lineno", line_number) + span.set_attribute("func_name", function_name) + try: + result = func(*args, **kwargs) + except Exception as err: + span.record_exception(err) + span.set_status(Status(StatusCode.ERROR)) + raise + else: + # we don't have a tracer anymore, but let's still execute. 
+ result = func(*args, **kwargs) + return result + + return decorate + + return outer_decorate diff --git a/PBS_Utils/pbs_library.py b/PBS_Utils/pbs_library.py new file mode 100644 index 0000000000000000000000000000000000000000..563323e2766af2f9b38a20c4af9f38fddf40c4aa --- /dev/null +++ b/PBS_Utils/pbs_library.py @@ -0,0 +1,723 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +import tempfile +import time +import abc +import json +import os +import socket +import subprocess +import uuid +from pathlib import Path +from pprint import pformat +from typing import Dict, List, Optional, Union +from datetime import datetime +from typing import TypedDict +from PBS_Utils.pbs_util import timedelta_str_to_td +from PBS_Utils.pbs_util import get_now + + +class PBSMetaData(TypedDict): + timestamp: datetime + pbs_version: str + pbs_server: str + hostname: str + + +class PBSResponseData(TypedDict): + metadata: PBSMetaData + data: Optional[Union[Dict, List]] + + +class PBSResvData(TypedDict): + name: str + + +class PBSNodeData(TypedDict): + Mom: str + Port: str + last_state_change_time: str # 1612825851 + last_used_time: str # 1612825851 + license: str + ntype: str + pbs_version: str + pcpus: str # 8 + resources_assigned: Dict[str, str] + resources_available: Dict[str, str] + resv_enable: str # True + server_instance_id: str + sharing: str + state: str + + +def decorator_reconnect(method): + def decorate(slf, *args, **kwargs): + now = time.time() + if (now - slf.last_connect) > slf.timeout: + if slf.debug: + print(f"reconnecting due to timeout of {slf.timeout}. 
last_connect:{slf.last_connect} now:{now}") + slf.reconnect() + result = method(slf, *args, **kwargs) + return result + + return decorate + + +class PBSIFLError(Exception): + pass + + +class PBSInterfaceABC(metaclass=abc.ABCMeta): + def __init__(self, machine_name=None, debug=False): + self.machine_name = machine_name + self.debug = debug + + @abc.abstractmethod + def get_pbs_nodes(self, nodes=[]) -> PBSResponseData: + raise NotImplemented() + + @abc.abstractmethod + def get_pbs_vnodes(self, nodes=[]) -> PBSResponseData: + raise NotImplemented() + + @abc.abstractmethod + def get_qstat(self): + raise NotImplemented() + + +def get_metadata(machine_name) -> PBSMetaData: + now = get_now(notz=True) + version = None + # FIXME: if you do this, it will destroy the connection! + server = "" # pbs.server().name + result_dct = { + "timestamp": now, + "pbs_version": version, + "pbs_server": server, + "hostname": socket.gethostname(), + "machine_name": machine_name, + "pid": os.getpid(), + "uuid": uuid.uuid4().hex, + } + return result_dct + + +class PBSCommandInterface(PBSInterfaceABC): + def __init__(self, machine_name=None, debug=False): + self.cmd_pbsnodes = "/opt/pbs/bin/pbsnodes" + self.cmd_qstat = "/opt/pbs/bin/qstat" + + cmd_lst = [self.cmd_pbsnodes, self.cmd_qstat] + missing_cmd_lst = [] + for cmd in cmd_lst: + if not os.path.exists(cmd): + missing_cmd_lst.append(cmd) + if len(missing_cmd_lst) > 0: + raise ImportError(f"cannot load commands:{','.join(missing_cmd_lst)}") + + super().__init__(machine_name=machine_name, debug=debug) + + def get_pbs_nodes(self, nodes=[]) -> PBSResponseData: + # FIXME: get nodes to work + cmd_lst = [self.cmd_pbsnodes, "-aF", "json"] + pipe = subprocess.PIPE + proc = subprocess.Popen(cmd_lst, stdout=pipe, stderr=pipe) + stdout: bytes + stderr: bytes + stdout, stderr = proc.communicate() + returncode = proc.returncode + if returncode == 0: + node_data = json.loads(stdout.decode().replace("\n", "")) + else: + if b"no node list" in stderr: + 
node_data = { + "nodes": {}, + "timestamp": time.time(), + # 'pbs_version': ?, + # 'pbs_server': ?, + } + else: + node_data = {} + if self.debug: + print(f"Error: cmd_lst: {cmd_lst} returned {returncode}") + print(f"stdout:{stdout}") + print(f"stderr:{stderr}") + result_dct = { + "metadata": get_metadata(self.machine_name), + "data": node_data, + } + return result_dct + + def pbs_qstat(self): + cmd_lst = [self.cmd_qstat, "-t", "-f", "-F", "json"] + pipe = subprocess.PIPE + proc = subprocess.Popen(cmd_lst, stdout=pipe, stderr=pipe) + stdout: bytes + stderr: bytes + stdout, stderr = proc.communicate() + returncode = proc.returncode + if self.debug: + print(f"Error: cmd_lst: {cmd_lst} returned {returncode}") + print(f"stdout:{stdout}") + print(f"stderr:{stderr}") + if returncode == 0: + dct = json.loads(stdout.decode().replace("\n", "")) + else: + dct = None + return dct + + def get_pbs_vnodes(self, nodes=[]) -> PBSResponseData: + # FIXME: get nodes to work + cmd_lst = [self.cmd_pbsnodes, "-avF", "json"] + pipe = subprocess.PIPE + proc = subprocess.Popen(cmd_lst, stdout=pipe, stderr=pipe) + stdout: bytes + stderr: bytes + stdout, stderr = proc.communicate() + returncode = proc.returncode + if returncode == 0: + node_data = json.loads(stdout.decode().replace("\n", "")) + else: + if b"no node list" in stderr: + node_data = { + "nodes": {}, + "timestamp": time.time(), + # 'pbs_version': ?, + # 'pbs_server': ?, + } + else: + node_data = {} + if self.debug: + print(f"Error: cmd_lst: {cmd_lst} returned {returncode}") + print(f"stdout:{stdout}") + print(f"stderr:{stderr}") + result_dct = { + "metadata": get_metadata(self.machine_name), + "data": node_data, + } + return result_dct + + +class PBSIFLInterface(PBSInterfaceABC): + + def __init__(self, machine_name=None, host=None, debug=False, pbs_ifl=None): + if pbs_ifl is None: + import pbs_ifl + self.pbs_ifl = pbs_ifl + self.host = host + self.last_connect = time.time() + self.conn = pbs_ifl.pbs_connect(None) + self.timeout = 
1800 - 60 + if self.conn == -1: + raise PBSIFLError(f"Error: Could not connect to pbs_ifl on host:{self.host}.") + super().__init__(machine_name, debug=debug) + + def __del__(self): + if hasattr(self, "conn") and self.conn: + self.pbs_ifl.pbs_disconnect(self.conn) + + def disconnect(self): + if hasattr(self, "conn") and self.conn: + self.pbs_ifl.pbs_disconnect(self.conn) + + def reconnect(self): + now = time.time() + if self.debug: + print(f"reconnecting to pbs_ifl. last_connect:{self.last_connect} now:{now}") + self.disconnect() + self.conn = self.pbs_ifl.pbs_connect(None) + self.last_connect = now + if self.conn == -1: + if self.host is None: + host = self.get_pbs_server_host() + else: + host = self.host + raise PBSIFLError(f"Error: Could not connect to pbs_ifl on host:{host}.") + + def get_pbs_server_host(self): + server = self.pbs_ifl.get_pbs_server() + return server + + def check_for_error(self, func, bstat_head): + errmsg = self.pbs_ifl.pbs_geterrmsg(self.conn) + errno = self.pbs_ifl.get_pbs_errno() + if self.debug: + print(f"pbs_ifl errno:{errno}, func:{func}, bstat_head:{bstat_head}") + if errno == 15064: + pass # Server has no node list + elif errno != 0: + # errors we want to comment on, but still throw an error + if errno == 15136: + print(f"pbs_ifl hook error errno:{errno} errmsg:{errmsg} func:{func} bstat_head:{bstat_head}") + elif errno == 15020 or errno == 15031: + self.reconnect() + raise PBSIFLError(pformat(f"pbs_ifl(reconnected) errno:{errno} txt:{self.pbs_ifl.pbse_to_txt(errno)}")) + raise PBSIFLError(pformat(f"pbs_ifl errno:{errno} txt:{self.pbs_ifl.pbse_to_txt(errno)}")) + + def get_manager_cmd(self, command): + # FIXME: this should be in pbs_ifl as a reverse like in _svr_types.py + pbs_ifl = self.pbs_ifl + if command == "create": + result = pbs_ifl.MGR_CMD_CREATE + elif command == "delete": + result = pbs_ifl.MGR_CMD_DELETE + elif command == "set": + result = pbs_ifl.MGR_CMD_SET + elif command == "unset": + result = pbs_ifl.MGR_CMD_UNSET + 
elif command == "list": + result = pbs_ifl.MGR_CMD_LIST + elif command == "print": + result = pbs_ifl.MGR_CMD_PRINT + elif command == "active": + result = pbs_ifl.MGR_CMD_ACTIVE + elif command == "import": + result = pbs_ifl.MGR_CMD_IMPORT + elif command == "export": + result = pbs_ifl.MGR_CMD_EXPORT + else: + raise Exception(f"invalid command {command}") + return result + + def get_manager_obj(self, obj): + # FIXME: this should be in pbs_ifl as a reverse like in _svr_types.py + pbs_ifl = self.pbs_ifl + if obj == "server": + result = pbs_ifl.MGR_OBJ_SERVER + elif obj == "queue": + result = pbs_ifl.MGR_OBJ_QUEUE + elif obj == "job": + result = pbs_ifl.MGR_OBJ_JOB + elif obj == "node": + result = pbs_ifl.MGR_OBJ_NODE + elif obj == "reservation": + result = pbs_ifl.MGR_OBJ_RESV + elif obj == "resource": + result = pbs_ifl.MGR_OBJ_RSC + elif obj == "host": + result = pbs_ifl.MGR_OBJ_HOST + elif obj == "hook": + result = pbs_ifl.MGR_OBJ_HOOK + elif obj == "pbs_hook": + result = pbs_ifl.MGR_OBJ_PBS_HOOK + elif obj == "jobarray_parent": + result = pbs_ifl.MGR_OBJ_JOBARRAY_PARENT + elif obj == "subjob": + result = pbs_ifl.MGR_OBJ_SUBJOB + else: + raise Exception(f"invalid object {obj}") + return result + + @decorator_reconnect + def _get_pbs_job_data(self, jobid: str, attrib, extend, err_handler) -> Dict: + """ + return a dictionary with jobid as the key and the entire job data as the value. + """ + pbs_ifl = self.pbs_ifl + bstat_head = pbs_ifl.pbs_statjob(self.conn, jobid, attrib, extend) + try: + self.check_for_error(pbs_ifl.pbs_statjob, bstat_head) + except PBSIFLError as err: + if err_handler: + err_handler(err) + else: + raise + data = process_standard_bstat_head(bstat_head, pbs_ifl) + return data + + def get_pbs_jobs(self, jobids=[], attrib=None, extend=None, score=False, err_handler=None) -> PBSResponseData: + if score: + try: + # FIXME: This is not the dotted server. How do we get the full server. 
+ # server = self.get_pbs_server() + # job_sort_formula = self.get_pbs_server(attrl=attrl)['data'][server]['job_sort_formula'] + attrl = self.get_attrl("job_sort_formula") + dct = list(self.get_pbs_server(attrl=attrl)["data"].values()).pop() + job_sort_formula = dct["job_sort_formula"] + except Exception: + job_sort_formula = None + else: + job_sort_formula = None + dct = {} + if not jobids: + job_dct = self._get_pbs_job_data("", attrib, extend, err_handler) + dct.update(job_dct) + else: + for jobid in jobids: + job_dct = self._get_pbs_job_data(jobid, attrib, extend, err_handler) + dct.update(job_dct) + + if score and job_sort_formula: + for jobid, job_dct in dct.items(): + score_vars = {"walltime": 0} + # set server eligible_time_enable = True + score_vars["eligible_time"] = timedelta_str_to_td(job_dct["eligible_time"]).total_seconds() + for key, value in job_dct["Resource_List"].items(): + if key in ["walltime"]: + try: + score_vars[key] = timedelta_str_to_td(value).total_seconds() + except (TypeError, ValueError): + pass + else: + try: + score_vars[key] = int(value) + except (TypeError, ValueError): + pass + if self.debug: + print(pformat(job_sort_formula)) + print(pformat(score_vars)) + score_value = eval(job_sort_formula, score_vars) + job_dct["score"] = score_value + + result_dct = { + "metadata": get_metadata(self.machine_name), + "data": dct, + } + return result_dct + + @decorator_reconnect + def _get_pbs_node_data(self, node: str) -> Dict[str, PBSNodeData]: + pbs_ifl = self.pbs_ifl + bstat_head = pbs_ifl.pbs_statnode(self.conn, node, None, None) + self.check_for_error(pbs_ifl.pbs_statnode, bstat_head) + data = process_standard_bstat_head(bstat_head, pbs_ifl) + return data + + def get_pbs_nodes(self, nodes=[], debug=False) -> PBSResponseData: + data = {} + if not nodes: + dct = self._get_pbs_node_data("") + data.update(dct) + else: + for node in nodes: + dct = self._get_pbs_node_data(node) + data.update(dct) + if self.debug: + print(pformat(data)) + 
result_dct = { + "metadata": get_metadata(self.machine_name), + "data": data, + } + return result_dct + + def get_qstat(self): + raise NotImplemented() + + @decorator_reconnect + def _get_pbs_vnode_data(self, node: str) -> Dict[str, PBSNodeData]: + pbs_ifl = self.pbs_ifl + func = pbs_ifl.pbs_statvnode + bstat_head = func(self.conn, node, None, None) + self.check_for_error(func, bstat_head) + data = process_standard_bstat_head(bstat_head, pbs_ifl) + return data + + def get_pbs_vnodes(self, nodes=[]) -> PBSResponseData: + node_data = {} + if not nodes: + node_dct = self._get_pbs_vnode_data("") + node_data.update(node_dct) + else: + for node in nodes: + node_dct = self._get_pbs_vnode_data(node) + node_data.update(node_dct) + if self.debug: + print(pformat(node_data)) + result_dct = { + "metadata": get_metadata(self.machine_name), + "data": node_data, + } + return result_dct + + def get_attropl(self, name, value=None, resource=None, next_attropl=None, batch_op: int = None): + pbs_ifl = self.pbs_ifl + attropl = pbs_ifl.attropl() + attropl.name = name + attropl.value = value + if resource: + attropl.resource = resource + if next_attropl: + attropl.next = next_attropl + if batch_op: + attropl.op = batch_op + # todo: where is freeattropl or free_attropl hiding... 
+ return attropl + + def get_attrl(self, name, value=None, resource=None, next_attrl=None): + pbs_ifl = self.pbs_ifl + attrl = pbs_ifl.attrl() + attrl.name = name + attrl.value = value + if resource: + attrl.resource = resource + if next_attrl: + attrl.next = next_attrl + # todo: where is free_attrl_list or free_attrl + return attrl + + @decorator_reconnect + def _get_pbs_server_data(self, attrl=None) -> Dict[str, PBSNodeData]: + pbs_ifl = self.pbs_ifl + func = pbs_ifl.pbs_statserver + bstat_head = func(self.conn, attrl, None) + self.check_for_error(func, bstat_head) + data = process_standard_bstat_head(bstat_head, pbs_ifl) + return data + + def get_pbs_server(self, attrl=None) -> PBSResponseData: + data = self._get_pbs_server_data(attrl=attrl) + if self.debug: + print(pformat(data)) + result_dct = { + "metadata": get_metadata(self.machine_name), + "data": data, + } + return result_dct + + @decorator_reconnect + def _get_pbs_queue_data(self, queue_name=None) -> Dict[str, PBSNodeData]: + pbs_ifl = self.pbs_ifl + func = pbs_ifl.pbs_statque + bstat_head = func(self.conn, queue_name, None, None) + self.check_for_error(func, bstat_head) + data = process_standard_bstat_head(bstat_head, pbs_ifl) + return data + + def get_pbs_queues(self, queue_name=None) -> PBSResponseData: + data = self._get_pbs_queue_data(queue_name=queue_name) + if self.debug: + print(pformat(data)) + result_dct = { + "metadata": get_metadata(self.machine_name), + "data": data, + } + return result_dct + + @decorator_reconnect + def _get_pbs_resv_data(self, resvid: str) -> Dict[str, PBSResvData]: + pbs_ifl = self.pbs_ifl + bstat_head = pbs_ifl.pbs_statresv(self.conn, resvid, None, None) + self.check_for_error(pbs_ifl.pbs_statresv, bstat_head) + data = process_standard_bstat_head(bstat_head, pbs_ifl) + return data + + def get_pbs_reservations(self, resvids=[]) -> PBSResponseData: + dct = {} + if not resvids: + resv_dct = self._get_pbs_resv_data("") + dct.update(resv_dct) + else: + for resvid in resvids: 
+ resv_dct = self._get_pbs_job_data(resvid) + dct.update(resv_dct) + result_dct = { + "metadata": get_metadata(self.machine_name), + "data": dct, + } + return result_dct + + def pbs_stathook(self, hook_name, attrib_lst, extend) -> PBSResponseData: + pbs_ifl = self.pbs_ifl + bstat_head = pbs_ifl.pbs_stathook(self.conn, hook_name, attrib_lst, extend) + self.check_for_error(pbs_ifl.pbs_stathook, bstat_head) + data = process_standard_bstat_head(bstat_head, pbs_ifl) + return data + + def pbs_statserver(self, pbs_server, attrl_name): + pbs_ifl = self.pbs_ifl + extend = None + if attrl_name: + attrl = pbs_ifl.attrl() + attrl.name = attrl_name + else: + attrl = "" + bstat_head = pbs_ifl.pbs_statserver(self.conn, attrl, extend) + self.check_for_error(pbs_ifl.pbs_manager, bstat_head) + data = process_standard_bstat_head(bstat_head, pbs_ifl) + if attrl_name: + server_data = data[pbs_server] + try: + result = server_data[attrl_name] + except KeyError: + result = None # an attribute is not set in the server. 
+ else: + result = data + return result + + def pbs_manager(self, command, object_type, object_name="", attropl=None, extend=None) -> PBSResponseData: + if type(command) == str: + command = self.get_manager_cmd(command) + if type(object_type) == str: + object_type = self.get_manager_obj(object_type) + pbs_ifl = self.pbs_ifl + errno = pbs_ifl.pbs_manager(self.conn, command, object_type, object_name, attropl, extend) + self.check_for_error(pbs_ifl.pbs_manager, None) + return errno + + def pbs_manager_set_and_wait(self, attrl_name, attropl_value, delay=0.1): + command = self.pbs_ifl.MGR_CMD_SET + obj_type = self.pbs_ifl.MGR_OBJ_SERVER + pbs_server = self.get_pbs_server() + attropl = self.get_attropl(attrl_name, attropl_value) + self.pbs_manager(command, obj_type, attropl=attropl) + count = 0 + max_count = 32 + while True: + data = self.pbs_statserver(pbs_server, attrl_name) + if data.lower() == attropl_value.lower(): + break + time.sleep(delay) + count += 1 + if count >= max_count: + raise Exception(f"error setting value, check count exceeded:{count}") + + def pbs_submit(self, job_script, attropl_lst): + pbs_ifl = self.pbs_ifl + prev_attropl = None + attropl = pbs_ifl.attropl() + attropl_first = attropl + for attropl_dct in attropl_lst: + attropl.name = attropl_dct["name"] + attropl.value = attropl_dct["value"] + if "resource" in attropl_dct: + attropl.resource = attropl_dct["resource"] + if prev_attropl: + prev_attropl.next = attropl + prev_attropl = attropl + attropl = pbs_ifl.attropl() + data = pbs_ifl.pbs_submit(self.conn, attropl_first, job_script, "", "") + # todo: figure out how to check for errors and not mess up the return. 
+ # self.check_for_error(pbs_ifl.pbs_submit, data) + return data + + def pbs_delete(self, jobid, wait=False, delay=0.1): + pbs_ifl = self.pbs_ifl + status = pbs_ifl.pbs_deljob(self.conn, jobid, None) + if status == 0: + if wait: + count = 0 + max_count = 32 + + def handler(err): + errno = self.pbs_ifl.get_pbs_errno() + if errno == 15001: + pass + elif errno == 15139 and str(err) == "'pbs_ifl errno:15139 txt:Job has finished'": + pass + else: + raise (err) + + while True: + data = self._get_pbs_job_data(jobid, None, None, handler) + if jobid not in data: + break + time.sleep(delay) + count += 1 + if count >= max_count: + raise Exception(f"Error waiting for job {jobid} to be deleted, check count exceeded:{count}") + else: + pass # we are not waiting, just returning, we already have a status of 0. + else: + raise Exception(f"Job {jobid} was not successfully deleted:{status}") + + +def process_standard_bstat_head(bstat_head, pbs_ifl): + data = {} + bstat_root = bstat_head + while bstat_head: + dct = {} + if bstat_head: + attrl = bstat_head.attribs + while attrl: + key = attrl.name + resource = attrl.resource + value = attrl.value + if not resource: + if key in dct: + raise Exception(f"key:{key} is already set and resource didn't exist on the previous loop.") + dct[key] = value + else: + if key not in dct: + dct[key] = {} + dct[key].update( + { + resource: value, + } + ) + # # this was a different way. It's building a dictionary. 
+ # if key in dct: + # raise Exception(f"key:{key} is already set and resource existed on the previous loop.") + # dct[key] = { + # "value": value, + # "resource": resource, + # } + attrl = attrl.next + data[bstat_head.name] = dct + bstat_head = bstat_head.next + if type(bstat_root) == pbs_ifl.batch_status: + pbs_ifl.pbs_statfree(bstat_root) + return data + + +def autodetect_interface(machine_name=None, debug=False, host=None) -> PBSInterfaceABC: + try: + interface = PBSIFLInterface(machine_name=machine_name, host=host, debug=debug) + except ImportError as err: # failing to import pbs_ifl + print(f"failed to import pbs ifl interface, err:{err}") + try: + interface = PBSCommandInterface(machine_name=machine_name, debug=debug) + except ImportError as err: + print(f"failed to import pbs command interface, err:{err}") + interface = None + return interface + + +def pbs_import_hook_prepare(hook_path: Path, pbs_home: str, callbacks: Optional[list] = None): + """creates a temporary file in the pbs_home, copies the source file to that file, + executes callbacks on that file, then imports it.""" + fd, path = tempfile.mkstemp(prefix=hook_path.stem, suffix=hook_path.suffix, dir=pbs_home) + with os.fdopen(fd, "wb") as f_dst: + f_dst.write(hook_path.read_bytes()) + hook_path_dst = Path(path) + hook_file_dst = hook_path_dst.name + for callback in callbacks: + callback(hook_path_dst) + return hook_file_dst + + +def pbs_import_hook_execute(pbs_interface, hook_name, hook_file, content_encoding, content_type): + pbs_ifl = pbs_interface.pbs_ifl + attropl = pbs_interface.get_attropl("input-file", value=hook_file, batch_op=pbs_ifl.SET, next_attropl=None) + attropl = pbs_interface.get_attropl("content-encoding", value=content_encoding, batch_op=pbs_ifl.SET, next_attropl=attropl) + attropl = pbs_interface.get_attropl("content-type", value=content_type, batch_op=pbs_ifl.SET, next_attropl=attropl) + data = pbs_interface.pbs_manager(pbs_ifl.MGR_CMD_IMPORT, pbs_ifl.MGR_OBJ_HOOK, 
object_name=hook_name, attropl=attropl) + return data + + +def pbs_load_enums(pbs=None): + if pbs is None: + import pbs + enum_lookup = {} + masks = ("REVERSE_NODE_STATE", "REVERSE_ATR_VFLAGS", "REVERSE_HOOK_EVENT") + for key in ( + "REVERSE_ATR_VFLAGS", + "REVERSE_BATCH_OPS", + "REVERSE_BRP_CHOICES", + "REVERSE_HOOK_EVENT", + "REVERSE_JOB_STATE", + "REVERSE_JOB_SUBSTATE", + "REVERSE_MGR_CMDS", + "REVERSE_MGR_OBJS", + "REVERSE_NODE_STATE", + "REVERSE_RESV_STATE", + ): + dct = {} + mask = "MASK" if key in masks else "LONG" + for i, value in getattr(pbs, key).items(): + dct[value] = i + enum_lookup[f"{mask}__{key}"] = dct + return enum_lookup diff --git a/PBS_Utils/pbs_log.py b/PBS_Utils/pbs_log.py new file mode 100644 index 0000000000000000000000000000000000000000..a37dd50ff150c23a3828ddb06af2d721aa901a15 --- /dev/null +++ b/PBS_Utils/pbs_log.py @@ -0,0 +1,105 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. 
def resolver(enum, template, match):
    """Substitute the numeric code captured as group(1) with its enum name.

    Returns ``template`` formatted with name/orig, with ``"???"`` as the name
    for codes the enum does not know, or None when group(1) did not
    participate in the match.  Any ``PBS_BATCH_`` prefix is stripped from the
    formatted result.
    """
    result = None
    match_group = match.group(1)
    if match_group is not None:
        try:
            result = template.format(name=enum(int(match_group)).name, orig=match_group)
        except Exception:  # BUG FIX: bare except also trapped SystemExit/KeyboardInterrupt
            result = template.format(name="???", orig=match_group)
        result = result.replace("PBS_BATCH_", "")
    return result


def resolver_f(func, template, match):
    """Like resolver(), but maps the captured code through a plain callable."""
    result = None
    value = match.group(1)
    if value is not None:
        try:
            result = template.format(name=func(int(value)), orig=value)
        except Exception:  # BUG FIX: narrowed from a bare except
            result = template.format(name="???", orig=value)
    return result


def convert_state(template, match):
    """Decode a hex vnode-state bitmask captured as group(1) into the
    comma-joined ND_STATE_* names from pbs.REVERSE_NODE_STATE."""
    result = None
    if match:
        vnode_state_lst = []
        value = match.group(1)
        value_int = int(value, 16)
        if value_int == pbs.ND_STATE_FREE:
            vnode_state_lst.append("ND_STATE_FREE")
        else:
            vnode_state_lst = [val for (mask, val) in sorted(pbs.REVERSE_NODE_STATE.items()) if value_int & mask]
        try:
            vnode_state_str = ",".join(vnode_state_lst)
            result = template.format(name=vnode_state_str, orig=value)
        except Exception:  # BUG FIX: narrowed from a bare except
            result = template.format(name="???", orig=value)
    return result


def ERRNO(value):
    """Return the symbolic errno name for ``value``; 0 maps to the string "0"."""
    if value == 0:
        result = "0"
    else:
        result = errno.errorcode[value]
    return result
def process_log_event_type(line):
    """Best-effort: decorate the hex event-type field of a PBS log line.

    A PBS log line looks like ``<ts>;<event_type_hex>;<rest>``; the numeric
    field is rewritten as ``NAME1,NAME2(<orig>)`` using the LogEventTypes bit
    flags (``PBSEVENT_`` prefixes stripped).  On any failure — malformed line,
    non-hex field — the line is returned unchanged.
    """
    try:
        ts, event_type, tail = line.split(";", 2)
        event_type_int = int(event_type, 16)
        # collect the name of every event flag whose bit is set in the field
        event_type_lst = [e.name.replace("PBSEVENT_", "") for e in LogEventTypes if event_type_int & e.value]
        line = f"{ts};{','.join(event_type_lst)}({event_type});{tail}"
    except Exception:  # BUG FIX: narrowed from a bare except
        pass
    return line


def process_log_line(line):
    """Run every DECODERS entry whose cheap substring precheck matches, then
    decorate the event-type field.  Returns the (possibly rewritten) line."""
    for precheck, template, pattern, func in DECODERS:
        if precheck in line:
            line = re.sub(pattern, func, line)
    line = process_log_event_type(line)
    return line
+ +import copy +import getpass +import json +import os +import socket +import traceback +import uuid +from datetime import datetime +from PBS_Utils.pbs_util import datetime_to_str, get_now + + +class EmitterRabbitMQ: + def __init__(self, pika, rabbitmq_host, rabbitmq_port, event_type, debug_func=None, error_func=None): + self.pika = pika + self.rabbitmq_host = rabbitmq_host + self.rabbitmq_port = rabbitmq_port + self.event_type = event_type + self.error_func = error_func + self.debug_func = debug_func + + def send(self, queue, json_dump): + pika = self.pika + rabbitmq_host = self.rabbitmq_host + rabbitmq_port = self.rabbitmq_port + connection = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host)) + try: + channel = connection.channel() + channel.queue_declare( + queue=queue, + durable=True, # True: D + auto_delete=False, # True: AD + ) + channel.basic_publish(exchange="", routing_key=queue, body=json_dump) + except Exception: + lines = traceback.format_exc().strip().splitlines() + for lineno, line in enumerate(lines): + self.message_error(f"Error sending rabbitmq message: #{lineno}:Exception: {line}") + finally: + connection.close() + + def get_metadata(self, message_type): + """could move this into its own function.""" + metadata_dct = { + "hostname": socket.gethostname(), + "timestamp": datetime_to_str(get_now()), + "pid": os.getpid(), + "ppid": os.getppid(), + "uuid": uuid.uuid4().hex, + "name": __name__, + "user": getpass.getuser(), + "message_type": message_type, + } + return metadata_dct + + def emit_pbs_log_line(self, record_queue, record_obj): + metadata_dct = self.get_metadata(record_queue) + dct = record_obj.get_dct() + dct["timestamp"] = datetime_to_str(dct["timestamp"]) + dct["message_type"] = "pbs" + json_dump = json.dumps({"metadata": metadata_dct, "data": dct}, default=str) + self.send(record_queue, json_dump) + + def emit_system(self, record_queue, ts: datetime, te: datetime, dct): + metadata_dct = self.get_metadata(record_queue) + 
metadata_dct["ts"] = datetime_to_str(ts) + metadata_dct["te"] = datetime_to_str(te) + if self.event_type != "HOOK_EVENT_PERIODIC": + dct = copy.deepcopy(dct) + # this structure has metadata, ps, event, server, hook keys. + dct["metadata"] = metadata_dct + json_dump = json.dumps(dct, default=str) + self.send(record_queue, json_dump) + + def message_error(self, msg): + if self.error_func: + self.error_func(msg) + + def message_debug(self, msg): + if self.debug_func: + self.debug_func(msg) diff --git a/PBS_Utils/pbs_util.py b/PBS_Utils/pbs_util.py new file mode 100644 index 0000000000000000000000000000000000000000..3460d38bfb3b45bed87f8ff91a93d582e4913813 --- /dev/null +++ b/PBS_Utils/pbs_util.py @@ -0,0 +1,109 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +import os +import uuid +import socket +import getpass +from datetime import datetime, timedelta, timezone + +HOSTNAME = socket.gethostname() +utc_tz = timezone(timedelta(seconds=0), name="UTC") + + +def get_now(notz=False) -> datetime: + """Return a datetime object right now as the local server time then convert it to utc""" + if notz: + result = datetime.utcnow() + else: + result = datetime.now(utc_tz) + return result + + +def get_now_day_str() -> str: + datetime_now = get_now() + datetime_str = datetime_now.strftime("%Y%m%d") + return datetime_str + + +def datetime_to_str(obj): + return obj.strftime("%Y-%m-%dT%H:%M:%S.%f") + + +def get_metadata(machine_name, message_type): + metadata_dct = { + "hostname": HOSTNAME, + "timestamp": get_now().strftime("%Y-%m-%d %H:%M:%S.%f"), + "pid": os.getpid(), + "ppid": os.getppid(), + "uuid": uuid.uuid4().hex, + "machine_name": machine_name, + "message_type": message_type, + "user": getpass.getuser(), + } + return metadata_dct + + +def timedelta_str_to_td(td_str: str): + """Convert a timedelta string, such as a walltime into a timedelta.""" + if 
def timedelta_to_str(td: timedelta):
    """Convert a timedelta into a string.

    Format: ``[D day(s), ]HH:MM:SS[.ffffff]`` — like ``str(timedelta)`` but
    with a zero-padded hour field.  Negative durations follow timedelta's own
    normalization (e.g. -1 hour renders as ``-1 day, 23:00:00``).
    """
    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    clock = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
    if td.days:
        unit = "day" if td.days in (1, -1) else "days"
        clock = f"{td.days} {unit}, {clock}"
    if td.microseconds:
        clock = f"{clock}.{td.microseconds:06d}"
    return clock
FIXME: make comment more general. in header?
python plqstat.py --score --comment


## ---- plnodes.py ----
### give me similar output to pbsnodes
python plnodes.py

### output the nodes with the additional resource ncpus
python plnodes.py -r ncpus

### give a list of vnodes, one per line. #FIXME: add delim other than \n
python plnodes.py --header vnode --bare


## ---- pllog.py ----
### enhance the log and dump it to a file.
python pllog.py pipe --pipein /var/spool/pbs/server_logs/20231004 --pipeout /tmp/out.txt

### enhance the log and dump it to stdout
python pllog.py pipe --pipein /var/spool/pbs/server_logs/20231004

### enhance the log and dump it to stdout. gzip is only supported via --pipein
python pllog.py pipe --pipein /var/spool/pbs/server_logs/20230925.gz

### process a log via pipe.
cat /var/spool/pbs/server_logs/20231004 | python pllog.py pipe


## ---- plquery.py ----

### this is where data will be dumped and read from using -l or a specific prefix
export PLPATH=/data

### --filt applies a logical_and of the selectors.
### --filt-or applies a logical_or of the selectors.
### --agg, --groupby are quite complex.
### --bare should be used with a single header value to create a list. You can change the delimiter using --delim.

### first decide if you want to query the system every time or use cached data.
### if you want to use the latest data, set the variable to True and dump the data; it will go into PLPATH
export PLLATEST=True
python plquery.py dump

### dump data with pbsnodes and qstat
python plquery.py dump --dpn --dqs

### from this point on, if PLLATEST is True, it will use the data in PLPATH.

### what are the jobs on the system and vnode if running.
python plquery.py -l job

### what is the state of all jobs and count of nodes if running
python plquery.py -l --groupby jobid --groupby job_state --agg vnode=count job

### what size job can I run?
+python plquery.py -l --agg isavailable=sum avail + +python plquery.py -l --groupby jobid --groupby job_state --agg vnode=count job + +### which nodes are free? +python plquery.py -l --filt isavailable=1 avail + +### give me a list of nodes that are free in a comma separated list. +python plquery.py -l --filt isavailable=1 --header vnode --bare avail + +### special command +### pbsnodes -a -F json | jq -r '.nodes|.[]|select(.resources_available.broken == "True")|[.resources_available.host, .state, .comment] | join(",")' +python plquery.py -l --header vnode,resources_available.broken --filt resources_available.broken=False vnode +python plquery.py -l --header vnode --filt resources_available.broken=False --header vnode --bare vnode +python plquery.py -l --header vnode,resources_available.broken --filt resources_available.broken=True vnode + +### all queues with all jobs and their states. +python plquery.py -l queue-jobs-agg + +### all queues with only jobs in the running state +python plquery.py -l -f job_state=running queue-jobs-agg + +### how many possible nodes are in each queue and how many are available. +python plquery.py -l queue-avail + +### what size job can I run for queue x? +python plquery.py -l --filt queue=somequeue1 queue-avail + +### How many nodes are free in all queues? +python plquery.py -l --groupby isavailable --agg isavailable=count avail +### How many nodes are free in all queues and some weird sum of the available nodes, not useful. +python plquery.py -l --groupby isavailable --agg isavailable=count,sum avail + +### How many nodes are under a reservation, but the node state is not correct? +python plquery.py -l -vvv --filt node_state=free --filt isavailable=0 avail +python plquery.py -l -vvv --filt node_state=free --filt isavailable=0 --agg vnode=count avail + +### how many nodes are available to a reservation +python plquery.py -l --filt queue=somequeue1 queue-avail + +### which nodes are free for my reservation queue? 
+python plquery.py -l --filt queue=somequeue1 avail + +### how many nodes are "down" +python plquery.py -l --agg isdown=count,sum --filt isdown=1 avail --filt-in-or node_state_lst=down,offline,state-unknown,Stale + +### how many nodes are "down" or isavailable=0,1. Not useful but shows what could be done. +python plquery.py -l --agg isdown=count,sum --filt isdown=1 avail --filt-in-or node_state_lst=down,offline,state-unknown,Stale --filt-in-or isavailable=0,1 + +### give me the list of vnodes with the validation resource = True +python plquery.py -l --filt resources_available.validation=True --header vnode,comment vnode + +### give me all the down nodes and comments. +python plquery.py -l --header last_state_change_time,last_used_time,vnode,node_state,comment vnode|egrep "(down|offline)"|grep -v "free"|sort -k 1 -t " " + +### give me the list of nodes in a queue with a default_chunk +python plquery.py -l --filt queue=validation avail +python plquery.py -l --filt queue=validation --filt isavailable=1 avail + +### give me the list of nodes that are available in the queue +python plquery.py -l --header vnode --bare --filt queue=validation --filt isavailable=1 avail +python plquery.py -l --header vnode --bare --bare-delim ☃ --filt queue=validation --filt isavailable=1 avail + +### give me the vnode information for the 3 vnodes +python plquery.py -l --filt queue=workq --filt-or vnode=pdw-c02 --filt-or vnode=pdw-c01 --filt-or vnode=pdw-c03 avail + +### give me the vnode information for the 3 vnodes, but only if it's available. 
+python plquery.py -l --filt queue=workq --filt isavailable=1 --filt-or vnode=pdw-c02 --filt-or vnode=pdw-c01 --filt-or vnode=pdw-c03 avail
+
+### give me all the jobs that have not progressed in walltime, stuck
+python plquery.py -l job-special --option stuck
+
+### give me all the jobs that have not progressed in walltime, stuck, just the jobid
+python plquery.py -l --header jobid --bare job-special --option stuck
+
+## ---- plctrl.py ----
+### update the mom config with the debug level of 4095 along with 3 --usecp's, restarts pbs #FIXME: restart correctly.
+python plctrl.py mom-config 4095 --usecp '*:/fs/ /fs/' --usecp '*:/home/ /home/' --usecp '*:/dev/null /dev/null'
+
+### create many moms, 10624 to be exact. Probably requires sudo. --pstart is the start port and --pend is the port end.
+plctrl.py create-moms --mom pdw-c01 --name pdw-c01 --pstart 15005 --pend 25629
+
+### delete those 10624 moms
+plctrl.py delete-moms --mom pdw-c01 --name pdw-c01 --pstart 15005 --pend 25629 diff --git a/PBS_Utils/scripts/__init__.py b/PBS_Utils/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c3deec7350020ec11fe2a6caafcfde517f41e465 --- /dev/null +++ b/PBS_Utils/scripts/__init__.py @@ -0,0 +1,3 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. diff --git a/PBS_Utils/scripts/placct.py b/PBS_Utils/scripts/placct.py new file mode 100644 index 0000000000000000000000000000000000000000..636e1daa675c18702b928915a4eda472add907cf --- /dev/null +++ b/PBS_Utils/scripts/placct.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory.
@dataclass
class Settings:
    """Mutable CLI settings shared across click commands via ``pass_settings``.

    The previous revision declared ``@dataclass`` but then defined a manual
    ``__init__``, which suppressed the generated init/repr/eq entirely.
    Declaring ``verbose`` as a dataclass field restores the idiom while
    keeping ``Settings()`` (with ``verbose == 0``) working unchanged.
    """

    # verbosity level, incremented once per -v flag on the command line
    verbose: int = 0
def worker_logger(log_queue: Queue, complete_count):
    """Drain and emit log messages until every worker has signalled completion.

    Each worker process pushes a ``WorkComplete`` sentinel onto ``log_queue``
    when it finishes; ``complete_count`` is the number of workers expected to
    do so, and the loop exits once all sentinels have been seen.
    """
    while complete_count > 0:
        message = log_queue.get()
        # isinstance (not ``type(...) ==``) so subclasses of the sentinel
        # also terminate the loop; this is the idiomatic type check.
        if isinstance(message, WorkComplete):
            complete_count -= 1
            log.info(f"worker completed {message.count} lines.")
        else:
            log.debug(message)
@click.group()
@pass_settings
@click.option("--verbose", "-v", help="Increase verbosity level", count=True)
def cli(settings: Settings, verbose: int) -> None:
    """Root command group: record verbosity and configure the module logger."""
    settings.verbose = verbose
    # map the -v count onto a logging level: 2+ -> DEBUG, 1 -> INFO, 0 -> WARN
    if verbose >= 2:
        level = logging.DEBUG
    elif verbose == 1:
        level = logging.INFO
    else:
        level = logging.WARN
    log.setLevel(level)
+ print(f"invalid usecp line found, removing:{line}") + continue + else: + old_cp_dct[esrc] = edst + else: + new_lines.append(line) + + for esrc, edst in old_cp_dct.items(): + if esrc not in new_cp_dct: + new_lines.append(f"$usecp {esrc} {edst}") + else: + ndst = new_cp_dct[esrc] + new_lines.append(f"$usecp {esrc} {ndst}") + del new_cp_dct[esrc] + + for nsrc, ndst in new_cp_dct.items(): + new_lines.append(f"$usecp {nsrc} {ndst}") + + contents = ("\n".join(new_lines)) + "\n" + path.write_text(contents) + + # restart = False + if restart: + _restart_pbs(output=True) + + +# @cli.command() +# @pass_settings +# @click.option('--name', type=click.STRING) +# def mom_configd(settings: Settings, name): +# pass + + +def _restart_pbs(output=False): + cp = subprocess.run(["/etc/init.d/pbs", "restart"], capture_output=True) + if output: + print(f"stderr:{cp.stderr.decode('utf-8')}") + print(f"stdout:{cp.stdout.decode('utf-8')}") + print(f"return:{cp.returncode}") + return cp + + +@cli.command() +@pass_settings +def restart_pbs(settings: Settings): + cp = _restart_pbs(output=True) + return cp.returncode + + +# @cli.result_callback() +# def exit(result, **kwargs): +# if 'restart' in kwargs: +# restart() + + +@cli.command() +@click.option("--mom", type=click.STRING) +@click.option("--name", type=click.STRING) +@click.option("--pstart", type=click.INT) +@click.option("--pend", type=click.INT) +@pass_settings +def create_moms(settings: Settings, mom, name, pstart, pend): + fast = True + if fast: + import pbs_ifl + + pbs_interface = PBSIFLInterface(pbs_ifl=pbs_ifl) + else: + pbs_interface = None + time_now = get_now() + time_start = time_now + for i in range(pstart, pend + 1): + vnode = f"{name}-{i}" + if fast: + attropl = pbs_interface.get_attropl("mom", value=mom) + attropl = pbs_interface.get_attropl("port", value=str(i), next_attropl=attropl) + # attropl = pbs_interface.get_attropl('state', value='down', next_attropl=attropl) + data = 
@cli.command()
@click.option("--mom", type=click.STRING)
@click.option("--name", type=click.STRING)
@click.option("--pstart", type=click.INT)
@click.option("--pend", type=click.INT)
@pass_settings
def delete_moms(settings: Settings, mom, name, pstart, pend):
    """Delete vnodes ``{name}-{pstart}`` .. ``{name}-{pend}`` (inclusive) from PBS.

    Mirror of ``create-moms``: one vnode per port in the range.  Per-vnode
    timing deltas are printed as the deletions proceed.
    """
    # Developer toggle: True -> delete through the IFL API (fast),
    # False -> shell out to qmgr once per vnode (slow fallback).
    fast = True
    if fast:
        import pbs_ifl

        pbs_interface = PBSIFLInterface(pbs_ifl=pbs_ifl)
    else:
        pbs_interface = None
    time_now = get_now()
    time_start = time_now
    for i in range(pstart, pend + 1):
        vnode = f"{name}-{i}"
        if fast:
            try:
                data = pbs_interface.pbs_manager(pbs_ifl.MGR_CMD_DELETE, pbs_ifl.MGR_OBJ_NODE, object_name=vnode)
            except PBSIFLError as err:
                # a failed delete (e.g. vnode already gone) is reported but not fatal
                msg = str(err)
            else:
                msg = data
        else:
            cmd = f"/opt/pbs/bin/qmgr -c 'delete node {vnode}'"
            cp = subprocess.run(shlex.split(cmd), capture_output=True)
            if cp.returncode != 0:
                # NOTE(review): unlike the IFL path, a qmgr failure aborts the
                # remaining deletions entirely — presumably intentional; confirm.
                print(cp.returncode)
                print(cp.stdout.decode("utf-8"))
                print(cp.stderr.decode("utf-8"))
                break
            msg = cp.stdout.decode("utf-8")
        # per-vnode timing: delta between successive loop iterations
        time_prv = time_now
        time_now = get_now()
        time_delta = (time_now - time_prv).total_seconds()
        print(f"{time_now}Δ({time_delta: >4}):deleted vnode {vnode} on {mom}: {msg}")
    time_end = get_now()
    time_delta = (time_end - time_start).total_seconds()
    print(f"total Δ({time_delta: >4})")
== "__main__": + cli() diff --git a/PBS_Utils/scripts/pllog.py b/PBS_Utils/scripts/pllog.py new file mode 100755 index 0000000000000000000000000000000000000000..3e22af236912fa86c8b0dd27992f4282215f292e --- /dev/null +++ b/PBS_Utils/scripts/pllog.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +import io +import sys +import gzip +import click +import logging +from dataclasses import dataclass +from PBS_Utils.pbs_log import process_log_line + +try: + import pbs +except ImportError: + pbs = None + +logging.basicConfig() +log = logging.getLogger(__name__) + + +@dataclass +class Settings: + def __init__(self) -> None: + self.verbose: int = 0 + + +pass_settings = click.make_pass_decorator(Settings, ensure=True) + + +@click.group() +@pass_settings +@click.option("--verbose", "-v", help="Increase verbosity level", count=True) +def cli(settings: Settings, verbose: int) -> None: + settings.verbose = verbose + if settings.verbose > 2: + log.setLevel(logging.DEBUG) + elif settings.verbose > 1: + log.setLevel(logging.INFO) + else: + log.setLevel(logging.WARN) + + +@cli.command() +@pass_settings +@click.option("--pipein", type=click.File("rb"), default=sys.stdin) +@click.option("--pipeout", type=click.File("at"), default=sys.stdout) +def pipe(settings: Settings, pipein, pipeout): + f = gzip.open(pipein, "rb") + try: + line = f.readline() + except (TypeError, gzip.BadGzipFile): + f = pipein + try: + f.seek(0) # we read the first two bytes, give them back + except io.UnsupportedOperation: + pass # this was probably piped in. 
@click.group(invoke_without_command=True, context_settings={"max_content_width": 132})
@pass_settings
@click.option("--server", help="server host to connect via ifl", default=None)
@click.option("--verbose", "-v", help="Increase verbosity level", count=True)
@click.option("--sortby", multiple=True, help="column(s) to sort by")
@click.option("--resources", "-r", multiple=True)
@click.option(
    "--header",
    show_default=True,
    default="vnode,port,state,njobs,ncpus,jobs,resv,comment",
    help="The header to display, comma separated list.",
)
@click.option("--bare", is_flag=True, show_default=True, default=False, help="no header")
def cli(settings: Settings, server, verbose: int, sortby: list, resources: list, header, bare) -> None:
    """Query PBS vnodes over IFL and print a per-node summary table.

    Columns are selected with --header (plus any --resources), optionally
    sorted with --sortby, and printed without a header row when --bare.
    """
    settings.server = server
    settings.verbose = verbose
    if settings.verbose >= 2:
        log.setLevel(logging.DEBUG)
    elif settings.verbose == 1:
        log.setLevel(logging.INFO)
    else:
        log.setLevel(logging.WARN)

    # requested columns plus any extra resources the caller asked for
    header = header.split(",")
    for resource in resources:
        header.append(resource)

    pbs_interface = PBSIFLInterface(host=settings.server, pbs_ifl=pbs_ifl, debug=log.level == logging.DEBUG)
    result = pbs_interface.get_pbs_vnodes()
    vnodes = result["data"]
    node_lst = []
    for vnode_name, dct in vnodes.items():
        node_dct = {}
        node_dct["port"] = dct["Port"] if "Port" in dct else None
        # truncate comments so the table stays readable
        node_dct["comment"] = "" if "comment" not in dct else dct["comment"][:16]
        node_dct["vnode"] = vnode_name
        node_dct["state"] = dct["state"]
        node_dct["resv"] = "" if "resv" not in dct else dct["resv"]
        if "jobs" in dct:
            # "1234.server/0, 1234.server/1" -> unique short ids such as "1234"
            job_lst = sorted({job.split(".", 1)[0].strip() for job in dct["jobs"].split(",")})
        else:
            job_lst = []
        node_dct["njobs"] = len(job_lst)
        node_dct["jobs"] = " ".join(job_lst)

        key = "ncpus"
        ncpus_total = int(dct["resources_available"][key]) if key in dct["resources_available"] else 0
        # fix: resources_assigned may not contain ncpus; indexing it
        # unconditionally raised KeyError on vnodes with nothing assigned
        # (the generic resource loop below already guarded this case).
        ncpus_assigned = int(dct["resources_assigned"][key]) if key in dct["resources_assigned"] else 0
        node_dct[key] = f"{ncpus_total - ncpus_assigned}/{ncpus_total}"

        for key in resources:
            if key in dct["resources_available"]:
                resource_total = int(dct["resources_available"][key])
                resource_free = (
                    resource_total - int(dct["resources_assigned"][key]) if key in dct["resources_assigned"] else resource_total
                )
                node_dct[key] = f"{resource_free}/{resource_total}"
            else:
                node_dct[key] = "0/0"
        node_lst.append(node_dct)

    df = pd.DataFrame(node_lst)
    # fix: missing_cols was re-initialized inside the loop, so only the last
    # header column was ever checked; collect all missing columns at once.
    missing_cols = [col for col in header if col not in df]
    if not missing_cols:
        df = pd.DataFrame(df, columns=header)
    else:
        log.error(f"header failed, missing columns: {' '.join(missing_cols)}")

    if sortby:
        missing_cols = [col for col in sortby if col not in df]
        if not missing_cols:
            df = df.sort_values(by=list(sortby))
        else:
            log.error(f"sort failed, missing columns: {' '.join(missing_cols)}")

    if not df.empty:
        print(df.to_string(index=False, header=not bare))
    else:
        print("No nodes found.")
dest="comment", action=argparse.BooleanOptionalAction) + parser.add_argument("-Q", "--queue", dest="queue", action=argparse.BooleanOptionalAction) + # TODO: header + # TODO: sortby + # TODO: add filter for held jobs, etc. + # TODO: add nodes needed. + args = parser.parse_args() + return args + + +def err_handler(err_str): + print(err_str, flush=True) + + +def calculate_remaining(walltime: str, time_used: str): + td_wall = timedelta_str_to_td(walltime) + td_used = timedelta_str_to_td(time_used) + return timedelta_to_str(td_wall - td_used) + + +def main(): + args = get_argparse() + debug = True if args.debug else False + # json_output = True if args.json else False + fulljobid = True if args.fulljobid else False + vnodes = True if args.vnodes else False + history = True if args.history else False + score = True if args.score else False + comment = True if args.comment else False + queue = True if args.queue else False + + pbs_interface = PBSIFLInterface(pbs_ifl=pbs_ifl, debug=debug) + + if queue: + mode = "queue" + short_name_lookup = { + "resources_assigned": "ra", + "resources_default": "rd", + "resources_max": "rmax", + "resources_min": "rmin", + } + result_queue = pbs_interface.get_pbs_queues() + queues_dct = result_queue["data"] + queue_lst = [] + for queue_name, queue_dct in queues_dct.items(): + new_queue_dct = {} + new_queue_dct["name"] = queue_name + for key, value in queue_dct.items(): + if key in short_name_lookup: + key = short_name_lookup[key] + if type(value) == dict: + for subkey, subvalue in value.items(): + new_queue_dct[f"{key}.{subkey}"] = subvalue + else: + new_queue_dct[key] = value + queue_lst.append(new_queue_dct) + df = pd.DataFrame(queue_lst) + df = df.set_index("name") + else: + mode = "job" + columns = [ + "jobid", + "Job_Name", + "User", + "Queued", + "Walltime", + "Eligible", + "Time_Use", + "Remain", + "Nodes", + "S", + "Account_Name", + "queue", + ] + if score: + columns.append("score") + sort_column = "score" + else: + sort_column = 
"jobid" + if vnodes: + columns.append("Vnodes") + if comment: + columns.append("comment") + if history: + result_jobs = pbs_interface.get_pbs_jobs(attrib=None, extend="xt", err_handler=err_handler, score=score) + else: + result_jobs = pbs_interface.get_pbs_jobs(attrib=None, extend="t", err_handler=err_handler, score=score) + jobs = result_jobs["data"] + job_lst = [] + for jobid, job_dct in jobs.items(): + job_dct = copy.deepcopy(job_dct) + try: + stime = job_dct.get("stime", None) + if not fulljobid and "." in jobid: + jobid, _ = jobid.split(".", 1) + job_dct["jobid"] = jobid + job_dct["S"] = job_dct["job_state"] + job_dct["Project"] = job_dct.get("Account_Name", None) + job_dct["User"], _ = job_dct["Job_Owner"].split("@") + job_dct["Queued"] = datetime.datetime.utcfromtimestamp(int(job_dct["qtime"])) + job_dct["Start"] = datetime.datetime.utcfromtimestamp(int(stime)) if stime else None + job_dct["Walltime"] = job_dct["Resource_List"].get("walltime", None) + job_dct["Eligible"] = job_dct["eligible_time"] + # This is odd, resources_used doesn't always exist when a job is running... 
+ job_dct["Time_Use"] = job_dct.get("resources_used", {}).get("walltime", "00:00:00") # ' -- ' + if job_dct["Time_Use"] and job_dct["Walltime"] and job_dct["S"] == "R": + walltime = job_dct["Walltime"] + time_used = job_dct["Time_Use"] + job_dct["Remain"] = calculate_remaining(walltime, time_used) + else: + if job_dct["S"] == "E": + job_dct["Remain"] = " -- " + else: + job_dct["Remain"] = " -- " + job_dct["Time_Use"] = job_dct.get("resources_used", {}).get("walltime", " -- ") + job_dct["Nodes"] = job_dct["Resource_List"].get("nodect", None) + if vnodes: + vnode_lst = cast_to_vnode_list(job_dct["exec_vnode"]) if job_dct.get("exec_vnode", None) else [] + job_dct["Vnodes"] = ",".join(vnode_lst) + job_dct["comment"] = job_dct["comment"] if "comment" in job_dct else "" + job_lst.append(job_dct) + except Exception as err: + print(pformat(job_dct)) + raise + del jobs + df = pd.DataFrame(job_lst, columns=columns) + df = df.set_index("jobid") + df = df.replace(np.nan, "-", regex=True) + df = df.sort_values(sort_column, ascending=False) + + if not df.empty: + # if json_output: + # import json + # # FIXME: make this work. + # print(pformat(df.to_json(date_format='iso'))) + # else: + print(df) + else: + print(f"No {mode}s found.") + + +if __name__ == "__main__": + main() diff --git a/PBS_Utils/scripts/plquery.py b/PBS_Utils/scripts/plquery.py new file mode 100755 index 0000000000000000000000000000000000000000..ce6bafaee8cfa84293fca979c3dde07bedbf2856 --- /dev/null +++ b/PBS_Utils/scripts/plquery.py @@ -0,0 +1,1143 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. 
+ +"""Experimental!""" + +import re +import gzip +import copy +import json +import logging +import os +import socket +import subprocess +import sys +import datetime +from collections import defaultdict +from dataclasses import dataclass +from functools import partial +from pprint import pformat +from typing import Dict, Tuple, List, Optional +import pandas as pd +import numpy as np +from pandas.core.groupby.generic import DataFrameGroupBy + +import click + +from PBS_Utils.pbs_library import PBSIFLInterface, pbs_load_enums +from PBS_Utils.pbs_util import get_now, df_to_lstofdct +from PBS_Utils.pbs_accounting import cast_to_vnode_list, LOG_REGEX +from PBS_Utils.pbs_accounting import epoch_to_datetime, hms_to_seconds + +logging.basicConfig() +log = logging.getLogger(__name__) + +np.seterr(over="raise", under="raise") +np.set_printoptions(threshold=sys.maxsize, linewidth=132) +pd.set_option("display.max_rows", None) +pd.set_option("display.max_columns", None) +pd.set_option("display.width", None) +pd.set_option("display.max_colwidth", None) +pd.set_option("display.float_format", "{:.3f}".format) + +""" +There needs to be a way to filter the data before the processing and after. --fin --fout? +Option to get all columns on output, this requires processing the data. 
+""" + + +@dataclass +class Settings: + def __init__(self) -> None: + self.verbose: int = 0 + self.server = None + self.pbs_interface = None + self.format_json = False + self.data_path = None + self.data_prefix = None + self.sortby = None + self.filt: dict = {} + self.filt_or: dict = {} + self.groupby: list = [] + self.agg: Dict[str, List[str]] = {} + self.header: list = [] + self.bare = False + self.bare_delim = "," + self.data_latest = False + + +pass_settings = click.make_pass_decorator(Settings, ensure=True) + + +def cb_filt(ctx, param, value) -> dict: + bad_args = [] + result = {} + for filt in value: + try: + col, value = filt.split("=", 1) + except ValueError: + bad_args.append(filt) + else: + result[col] = value + if bad_args: + raise click.BadParameter(f"filter must be key=value. Bad filts:{','.join(bad_args)}") + return result + + +def cb_filt_multiple(ctx, param, value) -> dict: + bad_args = [] + result = defaultdict(list) + for filt in value: + try: + col, value = filt.split("=", 1) + except ValueError: + bad_args.append(filt) + else: + result[col].append(value) + if bad_args: + raise click.BadParameter(f"filter must be key=value. Bad filts:{','.join(bad_args)}") + return result + + +def cb_agg(ctx, param, value) -> Dict[str, List[str]]: + bad_args = [] + result = {} + for filt in value: + try: + col, value = filt.split("=", 1) + except ValueError: + bad_args.append(filt) + else: + try: + value_lst = value.split(",") + except: + bad_args.append(filt) + else: + result[col] = value_lst + if bad_args: + raise click.BadParameter(f"agg must be key=value or key=value,value. 
def cb_header(ctx, param, value) -> List[str]:
    """Click callback: split a comma-separated header string into a list.

    Spaces are stripped first; an empty/None value yields an empty list.
    """
    if not value:
        return []
    return value.replace(" ", "").split(",")
default=False, help="JSON output") +@click.option("--data-path", default=lambda: os.environ.get("PLPATH", ""), help="The data path. This can be set via PLPATH") +@click.option("--sortby", "-s", help="--sortby job_state", callback=cb_sortby) +@click.option("--filt", "--filt-and", "-f", help="--filt queue=workq", multiple=True, callback=cb_filt) +@click.option("--filt-or", help="--filt-or vnode=abc --filt-or vnode=def", multiple=True, callback=cb_filt_multiple) +@click.option("--groupby", help="groupby", multiple=True, callback=cb_groupby) +@click.option("--agg", help="aggregate", multiple=True, callback=cb_agg) +@click.option("--header", show_default=True, callback=cb_header, help="The header to display, comma separated list.") +@click.option("--bare", is_flag=True, show_default=True, default=False, help="bare output which can be used for other commands.") +@click.option("--bare-delim", show_default=True, default=",", help="no header") +@click.option( + "--data-latest", + "-l", + is_flag=True, + show_default=True, + default=False, + callback=cb_latest, + help="Use the latest data in directory. 
def flatten_dct(_dct: Dict, key_name: str, delim=".") -> Dict:
    """Return a copy of ``_dct`` with every key prefixed by ``key_name`` + ``delim``."""
    return {f"{key_name}{delim}{key}": value for key, value in _dct.items()}


def flatten_dct_of_dct(_dct: Dict) -> Dict:
    """Flatten one level of nesting: dict values become dotted keys, others pass through.

    Fix: the previous ``type(value) in [dict]`` test missed dict subclasses
    (e.g. OrderedDict/defaultdict coming out of JSON or API layers); use
    isinstance so those are flattened too.
    """
    dct = {}
    for key, value in _dct.items():
        if isinstance(value, dict):
            dct.update(flatten_dct(value, key))
        else:
            dct[key] = value
    return dct
format for depth.""" + lst = [] + for key, value in data.items(): + lst.append(flatten_key_into_value(value, key_type, key)) + lst = [flatten_dct_of_dct(dct) for dct in lst] + return lst + + +def convert_to_bool(value): + # return True if value in ['True', True, 1, '1'] else False + return 1 if value in ["True", True, 1, "1"] else 0 + + +def state_to_strings(enum_lookup, key, state): + mask = True if key.startswith("MASK") else False + lookup = enum_lookup[key] + lst = [] + for key, value in lookup.items(): + if mask: + if value & state: + lst.append(key) + else: + if value == state: + lst.append(key) + return lst + + +def short_id(value): + return value.split(".", 1)[0] + + +@dataclass +class PBSData: + df_jobs: pd.DataFrame + df_queues: pd.DataFrame + df_nodes: pd.DataFrame + df_vnodes: pd.DataFrame + df_reservations: pd.DataFrame + df_server: pd.DataFrame + enum_lookup: dict + metadata: dict + + +def query_all(pbs_interface) -> PBSData: + ts = get_now().strftime("%Y-%m-%dT%H:%M:%S.%f") + jobs = pbs_interface.get_pbs_jobs(score=True)["data"] + queues = pbs_interface.get_pbs_queues()["data"] + nodes = pbs_interface.get_pbs_nodes()["data"] + vnodes = pbs_interface.get_pbs_vnodes()["data"] + reservations = pbs_interface.get_pbs_reservations()["data"] + server = pbs_interface.get_pbs_server()["data"] + + jobs = flatten_pbs_structure(jobs, key_type="jobid") + queues = flatten_pbs_structure(queues, key_type="queue") + nodes = flatten_pbs_structure(nodes, key_type="node") + vnodes = flatten_pbs_structure(vnodes, key_type="vnode") + reservations = flatten_pbs_structure(reservations, key_type="resvid") + server = flatten_pbs_structure(server, key_type="server") + + df_jobs = pd.DataFrame(jobs) + if not df_jobs.empty: + df_jobs["jobid"] = df_jobs["jobid"].apply(short_id) + df_jobs = df_jobs.apply(pd.to_numeric, errors="ignore") + df_jobs["Rerunable"] = df_jobs["Rerunable"].apply(convert_to_bool) + df_jobs["jobid"] = df_jobs["jobid"].astype(object) + + df_queues = 
pd.DataFrame(queues) + df_queues = df_queues.apply(pd.to_numeric, errors="ignore") + + df_queues["started"] = df_queues["started"].apply(convert_to_bool) + df_queues["enabled"] = df_queues["enabled"].apply(convert_to_bool) + df_queues = df_queues.rename(columns={"started": "queue_started"}) + + df_nodes = pd.DataFrame(nodes) + df_nodes = df_nodes.apply(pd.to_numeric, errors="ignore") + df_nodes["resv_enable"] = df_nodes["resv_enable"].apply(convert_to_bool) + + df_vnodes = pd.DataFrame(vnodes) + df_vnodes = df_vnodes.apply(pd.to_numeric, errors="ignore") + df_vnodes["resv_enable"] = df_vnodes["resv_enable"].apply(convert_to_bool) + df_vnodes = df_vnodes.rename(columns={"state": "node_state"}) + + df_reservations = pd.DataFrame(reservations) + if not df_reservations.empty: + df_reservations["resvid"] = df_reservations["resvid"].apply(short_id) + df_reservations = df_reservations.apply(pd.to_numeric, errors="ignore") + df_reservations["resvid"] = df_reservations["resvid"].astype(object) + + df_server = pd.DataFrame(server) + df_server = df_server.apply(pd.to_numeric, errors="ignore") + df_server["scheduling"] = df_server["scheduling"].apply(convert_to_bool) + df_server["query_other_jobs"] = df_server["query_other_jobs"].apply(convert_to_bool) + df_server["job_history_enable"] = df_server["job_history_enable"].apply(convert_to_bool) + df_server["resv_enable"] = df_server["resv_enable"].apply(convert_to_bool) + df_server["flatuid"] = df_server["flatuid"].apply(convert_to_bool) + df_server["eligible_time_enable"] = df_server["eligible_time_enable"].apply(convert_to_bool) + + metadata = dict( + hostname=socket.gethostname(), + timestamp=ts, + ) + enum_lookup = pbs_load_enums() + pbsdata = PBSData(df_jobs, df_queues, df_nodes, df_vnodes, df_reservations, df_server, enum_lookup, metadata) + log.info(f"started queries at {ts}") + return pbsdata + + +def wrap_cast_to_vnode_list(value): + if pd.isna(value): + result = [] + else: + result = cast_to_vnode_list(value) + return 
result + + +def wrap_cast_to_vnode_count(value): + if pd.isna(value): + result = [] + else: + result = cast_to_vnode_list(value) + return len(result) + + +def split_column_lst_into_series(df, column, new_column): + """given a dataframe with a column that is a list, split that list into multiple series + and put the new rows back into the dataframe and remove the column. + TODO: look into the possible cartesian product here.. Need to remove dups after.""" + if not df.empty: + # ser = df[column].apply(pd.Series, 1).stack() # FutureWarning + ser = df[column].apply(lambda s: pd.Series(s, dtype=object), 1).stack() + ser.index = ser.index.droplevel(-1) + ser.name = new_column + df = df.join(ser) + df.drop([column], axis=1, inplace=True) + return df + + +# FIXME: add load and dump with hdf5, requires pytables. +def load_all(data_path, data_prefix): + log.info(f"loading prefix {data_prefix} from {data_path}") + df_jobs = pd.read_csv(os.path.join(data_path, f"{data_prefix}df_jobs.csv.gz"), dtype={"jobid": str}) + df_queues = pd.read_csv(os.path.join(data_path, f"{data_prefix}df_queues.csv.gz"), dtype={"queue": str}) + df_nodes = pd.read_csv(os.path.join(data_path, f"{data_prefix}df_nodes.csv.gz"), dtype={"node": str}) + df_vnodes = pd.read_csv(os.path.join(data_path, f"{data_prefix}df_vnodes.csv.gz"), dtype={"vnode": str}) + df_reservations = pd.read_csv(os.path.join(data_path, f"{data_prefix}df_reservations.csv.gz"), dtype={"resvid": str}) + df_server = pd.read_csv(os.path.join(data_path, f"{data_prefix}df_server.csv.gz")) + with gzip.open(os.path.join(data_path, f"{data_prefix}pbs_enums.json.gz"), "rb") as fd: + enum_lookup = json.loads(fd.read()) + with gzip.open(os.path.join(data_path, f"{data_prefix}metadata.json.gz"), "rb") as fd: + metadata = json.loads(fd.read()) + + if not df_jobs.empty: + df_jobs["jobid"] = df_jobs["jobid"].astype(object) + if not df_reservations.empty: + df_reservations["resvid"] = df_reservations["resvid"].astype(object) + pbsdata = 
PBSData(df_jobs, df_queues, df_nodes, df_vnodes, df_reservations, df_server, enum_lookup, metadata) + return pbsdata + + +def get_data(settings): + if settings.data_path and settings.data_prefix is not None: + pbsdata = load_all(settings.data_path, settings.data_prefix) + else: + pbs_interface = PBSIFLInterface(host=settings.server) + pbsdata = query_all(pbs_interface) + log.info(pformat(pbsdata.metadata)) + return pbsdata + + +@cli.command() +@pass_settings +@click.option("--dump-pbsnodes", "--dpn", is_flag=True, show_default=True, default=False, help="dump pbsnodes") +@click.option("--dump-qstat", "--dqs", is_flag=True, show_default=True, default=False, help="dump qstat") +def dump(settings: Settings, dump_pbsnodes, dump_qstat): + data_path = settings.data_path + data_prefix = settings.data_prefix + pbs_interface = PBSIFLInterface(host=settings.server) + pbsdata = query_all(pbs_interface) + df_jobs = pbsdata.df_jobs + df_queues = pbsdata.df_queues + df_nodes = pbsdata.df_nodes + df_vnodes = pbsdata.df_vnodes + df_reservations = pbsdata.df_reservations + df_server = pbsdata.df_server + enum_lookup = pbsdata.enum_lookup + metadata = pbsdata.metadata + lst = zip( + (df_jobs, df_queues, df_nodes, df_vnodes, df_reservations, df_server), + ("df_jobs", "df_queues", "df_nodes", "df_vnodes", "df_reservations", "df_server"), + ) + for df, df_name in lst: + fullpath = os.path.join(data_path, f"{data_prefix}{df_name}.csv.gz") + df.to_csv(fullpath, compression="gzip") + print(f"wrote {fullpath}") + + fullpath = os.path.join(data_path, f"{data_prefix}pbs_enums.json.gz") + with gzip.open(fullpath, "wb") as fd: + fd.write(json.dumps(enum_lookup).encode("utf-8")) + print(f"wrote {fullpath}") + + fullpath = os.path.join(data_path, f"{data_prefix}metadata.json.gz") + with gzip.open(fullpath, "wb") as fd: + fd.write(json.dumps(metadata).encode("utf-8")) + print(f"wrote {fullpath}") + + if dump_pbsnodes: + fullpath = os.path.join(data_path, f"{data_prefix}pbsnodes.json.gz") + cmd = 
f"set -o pipefail; pbsnodes -avF json | gzip -9c > {fullpath}" + ret = subprocess.run(cmd, shell=True) + if ret.returncode == 0: + print(f"wrote {fullpath}") + else: + print(f"failed to write pbsnodes output to {fullpath}") + if dump_qstat: + fullpath = os.path.join(data_path, f"{data_prefix}qstat.json.gz") + cmd = f"set -o pipefail; qstat -fF json | gzip -9c > {fullpath}" + ret = subprocess.run(cmd, shell=True) + if ret.returncode == 0: + print(f"wrote {fullpath}") + else: + print(f"failed to write qstat output to {fullpath}") + + +def job_state_to_word(value): + # taken from _base_types.py + if value == "T": + value = "transit" + elif value == "Q": + value = "queued" + elif value == "H": + value = "held" + elif value == "W": + value = "waiting" + elif value == "R": + value = "running" + elif value == "E": + value = "exiting" + elif value == "X": + value = "expired" + elif value == "B": + value = "begun" + elif value == "S": + value = "suspend" + elif value == "U": + value = "suspend_user" + elif value == "M": + value = "moved" + elif value == "F": + value = "finished" + return value + + +def logic_avail_nodes(ser) -> Tuple[int, str]: + jobid = ser["jobid"] if "jobid" in ser else None + queue_started = ser["queue_started"] if "queue_started" in ser else None + avail = 1 + comment_lst = [] + if ser["node_state"] != "free": + avail = 0 + comment_lst.append("Node state is not free.") + if ( + pd.notna(ser["node_state"]) + and "resv-exclusive" in ser["node_state"] + and (pd.isna(queue_started) or pd.isnull(queue_started)) + ): + avail = 0 + comment_lst.append("Node state is bad.") + if queue_started and pd.notna(queue_started) and pd.notnull(queue_started): + avail = 0 + comment_lst.append("Resv running.") + if jobid and pd.notna(jobid) and pd.notnull(jobid): + avail = 0 + comment_lst.append("Job running.") + comment = " ".join(comment_lst) + return avail, comment + + +def logic_avail_resv_nodes(ser) -> Tuple[int, str]: + jobid = ser["jobid"] if "jobid" in ser else 
None + queue_started = ser["queue_started"] if "queue_started" in ser else None + resvid = ser["resvid"] if "resvid" in ser else None + avail = 1 + comment_lst = [] + if ser["node_state"] != "resv-exclusive": + avail = 0 + comment_lst.append("Node state must be only resv-exclusive") + if resvid and "free" in ser["node_state"]: # opposite of problem above, reservation started, but node marked free. + avail = 0 + comment_lst.append("Node state is bad.") + if jobid and pd.notna(jobid) and pd.notnull(jobid): + avail = 0 + comment_lst.append("Job running.") + comment = " ".join(comment_lst) + return avail, comment + + +def df_apply_options(df, settings: Settings): + sortby = settings.sortby + filt: dict = settings.filt + filt_or: dict = settings.filt_or + groupby = settings.groupby + agg: Optional[Dict[str, List[str]]] = settings.agg + header: Optional[list] = settings.header + if type(df) == pd.DataFrame: + if df.empty: + pass + else: + pd_and_masks = [] + for col, value in filt.items(): + try: + # bool is a special case + if df[col].dtype.type == np.bool_: + value = convert_to_bool(value) + # we need to make the filter compareable via the column type. + tvalue = df[col].dtype.type(value) + pd_and_masks.append(df[col] == tvalue) + except KeyError: + log.warning(f"filt {col}={value} failed.") + if pd_and_masks: + df = df[np.logical_and.reduce(pd_and_masks)] + + pd_or_masks = [] + for col, value_lst in filt_or.items(): + for value in value_lst: + try: + # bool is a special case + if df[col].dtype.type == np.bool_: + value = convert_to_bool(value) + # we need to make the filter compareable via the column type. 
+ try: + tvalue = df[col].dtype.type(value) + except ValueError: + tvalue = value + pd_or_masks.append(df[col] == tvalue) + except KeyError: + log.warning(f"filt_or {col}={value} failed.") + if pd_or_masks: + df = df[np.logical_or.reduce(pd_or_masks)] + if header: + df = pd.DataFrame(df, columns=header) + if groupby: + df = df.groupby(groupby) + if agg: + # if type(df) == DataFrameGroupBy and not df.ngroups: + # raise click.BadParameter(f"cannot operate an agg on an empty dataframe.") + if type(df) == DataFrameGroupBy and not df.ngroups: + df = pd.DataFrame() + else: + df = df.agg(agg) + df = df.reset_index() # this allows df.to_string(index=False) to work. + elif type(df) == DataFrameGroupBy: + pass + if sortby: + df = df.sort_values(by=[sortby]) + return df + + +def df_display(df: pd.DataFrame, settings): + if settings.format_json: + if df.empty: + print(pformat({})) + else: + print(pformat(df.to_dict())) + else: + if settings.bare and len(settings.header) == 1: + if df.empty: + print("", end="") + else: + col = settings.header[0] + print(settings.bare_delim.join(df[col].to_list()), end="") + elif settings.bare and len(settings.header) != 1: + raise click.BadParameter(f"bare must be selected with exactly one header and it probably should be vnode") + else: + if not df.empty: + df = df.fillna("-") + print(df.to_string(index=False)) + + +def get_avail(df_queues, df_jobs, df_vnodes, df_reservations, enum_lookup): + # remove all other queue types + df_queues = df_queues[df_queues["queue_type"] == "Execution"] # we don't need routing. 
+ df_queues = pd.DataFrame(df_queues, columns=["queue", "queue_started"]) + df_jobs = pd.DataFrame(df_jobs, columns=["jobid", "exec_vnode"]) + df_jobs["exec_vnode_lst"] = df_jobs["exec_vnode"].apply(wrap_cast_to_vnode_list) + df_jobs.drop(["exec_vnode"], axis=1, inplace=True) + df_jobs = split_column_lst_into_series(df_jobs, "exec_vnode_lst", "vnode") + if not df_jobs.empty: + df_jobs = df_jobs[df_jobs["vnode"].notnull()] + df_vnodes = pd.DataFrame(df_vnodes, columns=["vnode", "node_state"]) + df_reservations = df_reservations.merge(df_queues, on=["queue"], how="left") + df_reservations = df_reservations[df_reservations["queue_started"] == 1] + df: pd.DataFrame = df_vnodes + if not df_jobs.empty: + df = df.merge(df_jobs, on=["vnode"], how="left") + if not df_reservations.empty: + df = df.merge(df_reservations, on=["vnode"], how="left") + df[["isavailable", "comment"]] = df.apply(logic_avail_nodes, axis=1, result_type="expand") + df[["isavailable_resv", "comment_resv"]] = df.apply(logic_avail_resv_nodes, axis=1, result_type="expand") + df = df.sort_values(by=["vnode"]) + df.drop(["resv_nodes", "reserve_state", "reserve_state_lst", "queue_started"], axis=1, inplace=True, errors="ignore") + df = df.reset_index() + return df + + +@cli.command() +@pass_settings +def queue_jobs_agg(settings: Settings): + agg = settings.agg + groupby = settings.groupby + filt = settings.filt + header = settings.header + valid_filters = {"job_state", "queue"} + if filt and not set(filt.keys()).issubset(valid_filters): + raise click.BadParameter(f"invalid filters provided: {set(filt.keys()) - valid_filters}. 
Valid:{valid_filters}") + + data = get_data(settings) + df_jobs = data.df_jobs + if not df_jobs.empty: + if "exec_vnode" in df_jobs.columns: + df_jobs["vnodes"] = df_jobs["exec_vnode"].apply(wrap_cast_to_vnode_count) + else: + df_jobs["vnodes"] = 0 + df_jobs["job_state"] = df_jobs["job_state"].apply(job_state_to_word) + for col, value in filt.items(): + df_jobs = df_jobs[df_jobs[col] == value] + dfts = df_jobs.groupby(["queue", "job_state"]).agg(count_jobs=("jobid", "count"), count_vnodes=("vnodes", "sum")) + df = dfts.reset_index() + else: + df = df_jobs + df = df_apply_options(df, settings) + df_display(df, settings) + + +@cli.command() +@pass_settings +def vnode(settings: Settings): + filt = settings.filt + + data = get_data(settings) + df_vnodes = data.df_vnodes + + valid_filters = set(df_vnodes.columns.to_list()) + if filt and not set(filt.keys()).issubset(valid_filters): + raise click.BadParameter(f"invalid filters provided: {set(filt.keys()) - valid_filters}. Valid:{valid_filters}") + df = df_vnodes + + df = df_apply_options(df, settings) + df_display(df, settings) + + +def find_subset(df_a, df_b, column): + """given two dataframes and a column that they share, find if a is a subset of b.""" + # FIXME: this cand be done much better in pandas. 
+ alstofdct = df_to_lstofdct(df_a[[column]]) + blstofdct = df_to_lstofdct(df_b[[column]]) + lstofdct = [] + for adct in alstofdct: + for bdct in blstofdct: + abit = adct[column] + bbit = bdct[column] + subset = not (bbit & np.logical_not(abit)).any() + if subset: + adct["subset"] = True + else: + adct["subset"] = False + lstofdct.append(adct) + df = pd.DataFrame(lstofdct, columns=["subset"]) + return df["subset"] + + +@cli.command() +@click.option("--filt-in-or", help="--filt-in-or node_state_lst=down,offline,state-unknown,Stale", callback=cb_filt, multiple=True) +@pass_settings +def avail(settings: Settings, filt_in_or): + agg = settings.agg + groupby = settings.groupby + filt = settings.filt + valid_aggs = {"isavailable", "vnode", "isdown", "resvid"} + if agg and not set(agg.keys()).issubset(valid_aggs): + raise click.BadParameter(f"invalid agg provided: {set(agg.keys()) - valid_aggs}. Valid:{valid_aggs}") + valid_groupby = {"isavailable", "resvid"} + if groupby and not set(groupby).issubset(valid_groupby): + raise click.BadParameter(f"invalid groupby provided: {set(groupby) - valid_groupby}. Valid:{valid_groupby}") + valid_filters = {"isavailable", "resources_available.broken", "node_state", "queue", "isdown"} + if filt and not set(filt.keys()).issubset(valid_filters): + raise click.BadParameter(f"invalid filters provided: {set(filt.keys()) - valid_filters}. Valid:{valid_filters}") + + data = get_data(settings) + df_queues = data.df_queues + df_jobs = data.df_jobs + df_vnodes = data.df_vnodes + df_reservations = data.df_reservations + enum_lookup = data.enum_lookup + + df_queues = df_queues[df_queues["queue_type"] == "Execution"] # we don't need routing. + df_reservations = split_reservations_into_nodes(df_reservations, enum_lookup) + df = get_avail(df_queues, df_jobs, df_vnodes, df_reservations, enum_lookup) + # the node is available, now we need to get the queue information merged with the vnodes. 
+ + if "queue" in filt: + # all nodes are not available to all queues. If we specify a queue, we have to look + # what is available to it. If no default chunks, it should be all. + if type(filt["queue"]) == list and len(filt["queue"]) > 1: + raise click.BadParameter("filt for avail must have only one queue specified.") + df_queues, df_vnodes = extract_queue_vnode(df_queues, df_vnodes) + df_queues = df_queues[df_queues["queue"] == filt["queue"]] + df_vnodes.drop(["jobs"], axis=1, inplace=True, errors="ignore") + df = df.merge(df_vnodes, on=["vnode"], how="left") + subset_mask = find_subset(df, df_queues, "resc_sel_bitmask") + df = df[subset_mask] + df = df.rename(columns={"node_state_y": "node_state"}) + df = pd.DataFrame( + df, + columns=[ + "vnode", + "node_state", + "jobid", + "resvid", + "queue", + "isavailable", + "comment", + "isavailable_resv", + "comment_resv", + ], + ) + df = df[df["queue"] == filt["queue"]] + del settings.filt["queue"] + + df = df.drop_duplicates() + df["node_state_lst"] = df["node_state"].apply(lambda s: s.split(",") if s and pd.notna(s) else []) + + def func_filt_in_or(filt_in_or, ser): + include = 0 + for key, filt_lst in filt_in_or.items(): + intersection = set(ser["node_state_lst"]).intersection(set(filt_lst)) + if intersection: + include = 1 + break + return include + + # fixme: special case: + if filt_in_or: + filt_in_or = dict([(k, v.split(",")) for k, v in filt_in_or.items()]) + df["isdown"] = df.apply(partial(func_filt_in_or, filt_in_or), axis=1) + df.drop(["node_state_lst"], axis=1, inplace=True, errors="ignore") + + df = df_apply_options(df, settings) + df_display(df, settings) + + +def func_bitmask(bitmask_imap, value_lst): + bitmask = np.zeros(64, dtype=np.int8) + for value in value_lst: + try: + bitmask[bitmask_imap[value]] = 1 + except: + pass + return bitmask + + +def func_resc_sel(queue_resc_cols, columns, sel_rename, ser): + lst = [] + for col in columns: + new_column = col.replace(sel_rename, f"{ser[col]}__") + if 
new_column in queue_resc_cols: + value = ser[col] + if pd.notna(value): + lst.append(new_column) + return lst + + +def extract_queue_vnode(df_queues, df_vnodes): + """Process default_chunks on the queue and resources_available on the nodes. Create a bitmask of each the resources + so that later they can be used for an intersection using the inhib logic. + for qdct in .. + for vdct in .. + qbit = qdct['resc_sel_bitmask'] + vbit = vdct['resc_sel_bitmask'] + subset = not (qbit & np.logical_not(vbit)).any() + """ + df_queues = df_queues.copy() + vnode_columns = set(filter(lambda c: c.startswith("resources_available."), df_vnodes.columns.to_list())) + queue_columns = set(filter(lambda c: c.startswith("default_chunk."), df_queues.columns.to_list())) + # queues_all = set(df_queues['queue'].unique()) # later we need this to find the queues that have no restrictions. + queue_resc_cols = set([col.replace("default_chunk.", f"{val}__") for col in queue_columns for val in [True, False]]) + vnode_resc_cols = set([col.replace("resources_available.", f"{val}__") for col in vnode_columns for val in [True, False]]) + assert queue_resc_cols.issubset( + vnode_resc_cols + ), f"a resource is defined in default_chunk that doesn't exist on a node. {queue_resc_cols - vnode_resc_cols}" + bitmask_imap = dict([(c, i) for i, c in enumerate(sorted(list(queue_resc_cols)), start=0)]) + # get the default_chunk.* columns into TRUE_value and FALSE_value or X_value + df_queues["resc_sel"] = df_queues.apply(partial(func_resc_sel, queue_resc_cols, queue_columns, "default_chunk."), axis=1) + df_queues["resc_sel_bitmask"] = df_queues["resc_sel"].apply(partial(func_bitmask, bitmask_imap)) + # build the bitmask on the fly right here. 
+ df_vnodes["resc_sel"] = df_vnodes.apply(partial(func_resc_sel, queue_resc_cols, vnode_columns, "resources_available."), axis=1) + df_vnodes["resc_sel_bitmask"] = df_vnodes["resc_sel"].apply(partial(func_bitmask, bitmask_imap)) + return df_queues, df_vnodes + + +def split_reservations_into_nodes(df_reservations, enum_lookup): + """given the reservations dataframe, get the resv_nodes and create a row, one for each node.""" + df_reservations = pd.DataFrame(df_reservations, columns=["resvid", "reserve_state", "resv_nodes", "queue"]) + df_reservations["reserve_state_lst"] = df_reservations["reserve_state"].apply( + partial(state_to_strings, enum_lookup, "LONG__REVERSE_RESV_STATE") + ) + df_reservations["resv_nodes_lst"] = df_reservations["resv_nodes"].apply(wrap_cast_to_vnode_list) + df_reservations = split_column_lst_into_series(df_reservations, "resv_nodes_lst", "vnode") + df_reservations["reserve_state_lst"] = df_reservations["reserve_state_lst"].apply(lambda x: ",".join(x)) + df_reservations.drop( + [ + "resv_nodes", + "reserve_state", + ], + axis=1, + inplace=True, + errors="ignore", + ) + df_reservations = df_reservations.rename(columns={"reserve_state_lst": "reserve_state"}) + return df_reservations + + +@cli.command() +@pass_settings +def queue_avail(settings: Settings): + filt = settings.filt + valid_filters = {"queue", "resvid"} + if filt and not set(filt.keys()).issubset(valid_filters): + raise click.BadParameter(f"invalid filters provided: {set(filt.keys()) - valid_filters}. Valid:{valid_filters}") + + data = get_data(settings) + df_queues = data.df_queues + df_jobs = data.df_jobs + df_vnodes = data.df_vnodes + df_reservations = data.df_reservations + enum_lookup = data.enum_lookup + + if "resvid" in filt: + # take the data down a bit. 
+ subfilt = {"resvid": filt["resvid"]} + df_reservations = df_reservations.loc[(df_reservations[list(subfilt)] == pd.Series(subfilt)).all(axis=1)] + + df_reservations = split_reservations_into_nodes(df_reservations, enum_lookup) + df_avail = get_avail(df_queues, df_jobs, df_vnodes, df_reservations, enum_lookup) + vnodes_all = set(df_vnodes["vnode"].unique()) + + df_queues, df_vnodes = extract_queue_vnode(df_queues, df_vnodes) + + qlstofdct = df_to_lstofdct(df_queues[["queue", "resc_sel", "resc_sel_bitmask"]]) + vlstofdct = df_to_lstofdct(df_vnodes[["vnode", "resc_sel", "resc_sel_bitmask", "node_state"]]) + queue_vnodes_agg_lookup = {} + available_vnodes = set(df_avail[df_avail["isavailable"] == 1]["vnode"].to_list()) + unavailable_vnodes = set(df_avail[df_avail["isavailable"] == 0]["vnode"].to_list()) + + # if queue has nodes, available can only come from it. + queues_with_nodes = defaultdict(set) + df_reservations = df_reservations[df_reservations["reserve_state"] == "RESV_STATE_RUNNING"] + + if df_reservations.empty: + rlstofdct = [] + else: + rlstofdct = df_to_lstofdct(df_reservations[["resvid", "vnode"]]) + for dct in rlstofdct: + queues_with_nodes[dct["resvid"]].add(dct["vnode"]) + + for qdct in qlstofdct: + queue = qdct["queue"] + if queue in queues_with_nodes: + available_vnodes_resv = set( + df_avail[((df_avail["isavailable_resv"] == 1) & (df_avail["queue"] == queue))]["vnode"].to_list() + ) + queue_avail_nodes = available_vnodes_resv + queue_possible_nodes = queues_with_nodes[queue] + else: + queue_avail_nodes = available_vnodes + queue_possible_nodes = vnodes_all + queue_vnodes_lookup = { + "possible": 0, + "available": 0, + "has_down": 0, + "has_offline": 0, + "has_free": 0, + "has_state-unknown": 0, + "has_Stale": 0, + "vnodes": set(), + } + for vdct in vlstofdct: + vnode = vdct["vnode"] + if vnode not in queue_possible_nodes: + continue + qbit = qdct["resc_sel_bitmask"] + vbit = vdct["resc_sel_bitmask"] + subset = not (qbit & 
np.logical_not(vbit)).any() + if subset: + queue_vnodes_lookup["vnodes"].add(vnode) + queue_vnodes_lookup["possible"] += 1 + if "down" in vdct["node_state"]: + queue_vnodes_lookup["has_down"] += 1 + if "offline" in vdct["node_state"]: + queue_vnodes_lookup["has_offline"] += 1 + if "free" in vdct["node_state"]: + queue_vnodes_lookup["has_free"] += 1 + if "state-unknown" in vdct["node_state"]: + queue_vnodes_lookup["has_state-unknown"] += 1 + if "Stale" in vdct["node_state"]: + queue_vnodes_lookup["has_Stale"] += 1 + queue_vnodes_available = queue_vnodes_lookup["vnodes"].intersection(queue_avail_nodes) + queue_vnodes_lookup["available"] = len(queue_vnodes_available) + queue_vnodes_agg_lookup[queue] = queue_vnodes_lookup + + lstofdct = [] + for queue, agg_dct in queue_vnodes_agg_lookup.items(): + new_agg_dct = dict((k, v) for k, v in agg_dct.items() if type(v) == int) + new_agg_dct["queue"] = queue + lstofdct.append(new_agg_dct) + df = pd.DataFrame(lstofdct) + + df = df_apply_options(df, settings) + df_display(df, settings) + + +@cli.command() +@pass_settings +def job(settings: Settings): + filt = settings.filt + data = get_data(settings) + df_jobs = data.df_jobs + if not settings.header: + settings.header = [ + "jobid", + "queue", + "job_state", + "substate", + "stime", + # 'Resource_List.nodect', + "nodect", + # 'Resource_List.walltime', 'resources_used.walltime', + "walltime", + "runtime", + "run_count", + "real_runtime", + "job_stuck", + "vnode", + ] + df_jobs["substate"] = df_jobs.get("substate", default=pd.Series(dtype=object)).apply( + lambda ser: "".join(state_to_strings(data.enum_lookup, "LONG__REVERSE_JOB_SUBSTATE", ser)).replace("JOB_SUBSTATE_", "") + ) + valid_filters = set(df_jobs.columns.to_list()) + if filt and not set(filt.keys()).issubset(valid_filters): + raise click.BadParameter(f"invalid filters provided: {set(filt.keys()) - valid_filters}. 
Valid:{valid_filters}") + if not df_jobs.empty: + df_jobs["job_state"] = df_jobs["job_state"].apply(job_state_to_word) + # this makes lots of rows. Maybe have a split option for this data. + # df_jobs['exec_vnode'] = df_jobs['exec_vnode'].apply(wrap_cast_to_vnode_list) + df_jobs["exec_vnode"] = df_jobs.get("exec_vnode", default=pd.Series(dtype=object)).apply(wrap_cast_to_vnode_list) + df_jobs = split_column_lst_into_series(df_jobs, "exec_vnode", "vnode") + df = df_jobs + df = df_apply_options(df, settings) + df_display(df, settings) + + +@cli.command() +@click.option("--option", help="special job query option --option stuck") +@pass_settings +def job_special(settings: Settings, option): + """return the jobs that are in R or E that are not progressing/stime is not changing and is 10 minute different than now.""" + valid_options = ["stuck"] + if option not in valid_options: + raise click.BadOptionUsage(f"Invalid option {option} selected. Valid:{','.join(valid_options)}") + + data = get_data(settings) + + if option == "stuck": + df_jobs = data.df_jobs + metadata = data.metadata + if not settings.header: + settings.header = [ + "jobid", + "queue", + "job_state", + "substate", + "stime", + # 'Resource_List.nodect', + "nodect", + # 'Resource_List.walltime', 'resources_used.walltime', + "walltime", + "runtime", + "run_count", + "real_runtime", + "job_stuck", + ] + + df = df_jobs + if not df.empty: + df["substate"] = df.get("substate", default=pd.Series(dtype=object)).apply( + lambda ser: "".join(state_to_strings(data.enum_lookup, "LONG__REVERSE_JOB_SUBSTATE", ser)).replace( + "JOB_SUBSTATE_", "" + ) + ) + timestamp = datetime.datetime.strptime(metadata["timestamp"], LOG_REGEX.LOG_FORMAT_ISO8601) + df["stime"] = df.get("stime", default=pd.Series(dtype=object)).apply( + lambda ts: epoch_to_datetime(ts) if pd.notna(ts) else np.nan + ) + df["resources_used.walltime_seconds"] = df.get("resources_used.walltime", pd.Series(dtype=object)).apply( + lambda ts: hms_to_seconds(ts) if 
ts else np.nan + ) + df["time_query"] = timestamp + + if not df.empty: + + def func_real_runtime(ser): + if "resources_used.walltime" in ser and pd.notna(ser["resources_used.walltime"]): + try: + result = (ser["time_query"] - ser["stime"]).total_seconds() + except: + result = np.nan + else: + result = np.nan + return result + + def func_stuck(ser): + epsilon = 10 * 60 + real_delta = abs(ser["resources_used.walltime_seconds"] - ser["real_runtime"]) + return 1 if real_delta > epsilon else 0 + + df["real_runtime"] = df.apply(func_real_runtime, axis=1) + df["job_stuck"] = df.apply(func_stuck, axis=1) + df = df.rename( + columns={ + "resources_used.walltime": "runtime", + "Resource_List.walltime": "walltime", + "Resource_List.nodect": "nodect", + } + ) + df = df[((df["job_state"].isin(["E", "R"]) & (df["job_stuck"] == 1)))] + + df = df_apply_options(df, settings) + df_display(df, settings) + + +if __name__ == "__main__": + cli() + +""" +# This is a bunch of commands to create some scenerios to test with. +pbs_jobid=$(/opt/pbs/bin/qsub -h -- /bin/sleep 1000);echo ${pbs_jobid}; qalter -o job_${pbs_jobid}.stdout -e job_${pbs_jobid}.stderr ${pbs_jobid}; qrls ${pbs_jobid}; +qmgr -c "set node pdw-c03 state=offline" +pbs_jobid2=$(/opt/pbs/bin/qsub -h -l select=4:ncpus=4,place=scatter -- /bin/sleep 1000);echo ${pbs_jobid2}; qalter -o job_${pbs_jobid2}.stdout -e job_${pbs_jobid2}.stderr ${pbs_jobid2}; qrls ${pbs_jobid2}; +pbs_resv_now=$(pbs_rsub -R $(date '+%H%M' -d "1 min") -D 01:00:00 -l select=1:ncpus=16|cut -d "." -f 1); echo $pbs_resv_now +pbs_resv_ltr=$(pbs_rsub -R $(date '+%H%M' -d "1 day") -D 01:00:00 -l select=1:ncpus=16|cut -d "." 
-f 1); echo $pbs_resv_ltr +qmgr -c "set server reserve_retry_time=60" +# stop pbs on pdw-c03, which is the reservation node of $pbs_resv_now +# the reservation will hop to somewhere + +pbs_rdel $pbs_resv_now +pbs_rdel $pbs_resv_ltr +qdel $pbs_jobid +qdel $pbs_jobid2 +qmgr -c "set node pdw-c03 state=free" + +pbs_resv_main=$(pbs_rsub -R $(date '+%H%M' -d "1 min") -D 01:00:00 --hosts pdw-c03|cut -d "." -f 1); echo $pbs_resv_main +""" diff --git a/README.md b/README.md index a776a061eed1aa99b0c4511c162aad2e3fa95457..67a64894a1ba966d167866009fcd9f4859356ac0 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,21 @@ +<!--- +Copyright (C) 2024, UChicago Argonne, LLC +Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +in the top-level directory. +---> + # PBS_Utils PBS_Utils is a collection of tools and libraries for administrating systems using the PBS Professional or OpenPBS scheduler. +## Authors and acknowledgment +Eric Pershey +Brian Toonen +George Rojas +Lisa Childers +Paul Rich + ## License / Copyright Please see [LICENSE.txt](LICENSE.txt). diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..b3490c2fabeb999ea5ca4f1182dcd08c48188338 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +[tool.autopep8] +max_line_length = 131 + +[tool.flake8] +max-line-length = 131 + +[tool.black] +line-length = 131 + +[tool.pylint] +max-line-length = 131 +disable = ["W0511"] + +[tool.pytest.ini_options] +pythonpath = [ + ".", +] +testpaths = [ + "tests", +] diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..945311c8cffe57fea1a87285500880ccb45e0870 --- /dev/null +++ b/setup.py @@ -0,0 +1,19 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. 
See accompanying LICENSE.txt file +# in the top-level directory. + +from setuptools import setup + +setup( + name="pbs_utils", + version="1.0.5", + description="pbs_utils", + author="Eric Pershey", + author_email="pershey@anl.gov", + packages=["PBS_Utils"], + zip_safe=False, + install_requires=[ + "python-dateutil", + "pytest>=7.1.2", + ], +) diff --git a/tests/data/pldata/20231003T150529_df_jobs.csv.gz b/tests/data/pldata/20231003T150529_df_jobs.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..b3839554da657e2dc6ad34ddb1a19c1089653434 Binary files /dev/null and b/tests/data/pldata/20231003T150529_df_jobs.csv.gz differ diff --git a/tests/data/pldata/20231003T150529_df_nodes.csv.gz b/tests/data/pldata/20231003T150529_df_nodes.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..29b0caaec622fe0b68dd6d75c56ade44c3f614f1 Binary files /dev/null and b/tests/data/pldata/20231003T150529_df_nodes.csv.gz differ diff --git a/tests/data/pldata/20231003T150529_df_queues.csv.gz b/tests/data/pldata/20231003T150529_df_queues.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..5f892bfea269229e2a74c73a06c9e6b7a06ffb4c Binary files /dev/null and b/tests/data/pldata/20231003T150529_df_queues.csv.gz differ diff --git a/tests/data/pldata/20231003T150529_df_reservations.csv.gz b/tests/data/pldata/20231003T150529_df_reservations.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..23293e8f1645f9e5d3b6531d44d0df102f7dd4ad Binary files /dev/null and b/tests/data/pldata/20231003T150529_df_reservations.csv.gz differ diff --git a/tests/data/pldata/20231003T150529_df_server.csv.gz b/tests/data/pldata/20231003T150529_df_server.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..d92b1df01780ee5ae2b6408cafbe7d78073897b6 Binary files /dev/null and b/tests/data/pldata/20231003T150529_df_server.csv.gz differ diff --git a/tests/data/pldata/20231003T150529_df_vnodes.csv.gz 
b/tests/data/pldata/20231003T150529_df_vnodes.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..40e6ebcfa0db02de21e52ceadb7633247f6981d3 Binary files /dev/null and b/tests/data/pldata/20231003T150529_df_vnodes.csv.gz differ diff --git a/tests/data/pldata/20231003T150529_metadata.json.gz b/tests/data/pldata/20231003T150529_metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..e796386f05991bc7035fe90fc4948ca3ee9dc05b Binary files /dev/null and b/tests/data/pldata/20231003T150529_metadata.json.gz differ diff --git a/tests/data/pldata/20231003T150529_pbs_enums.json.gz b/tests/data/pldata/20231003T150529_pbs_enums.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..06100d44b97dfeee5c089b8c9c6a3d463f0bca21 Binary files /dev/null and b/tests/data/pldata/20231003T150529_pbs_enums.json.gz differ diff --git a/tests/data/pldata/20231003T150529_pbsnodes.json.gz b/tests/data/pldata/20231003T150529_pbsnodes.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..e0ad625ab4ffd2bdaeedd16ba6cd00057badd31a Binary files /dev/null and b/tests/data/pldata/20231003T150529_pbsnodes.json.gz differ diff --git a/tests/data/pldata/20231003T150529_qstat.json.gz b/tests/data/pldata/20231003T150529_qstat.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..436e410abddba8e7f7e97d658d46d8bcf0790eeb Binary files /dev/null and b/tests/data/pldata/20231003T150529_qstat.json.gz differ diff --git a/tests/data/pldata/20231003T150639_df_jobs.csv.gz b/tests/data/pldata/20231003T150639_df_jobs.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..55c75590786b43da125045cb817969b5350f4c31 Binary files /dev/null and b/tests/data/pldata/20231003T150639_df_jobs.csv.gz differ diff --git a/tests/data/pldata/20231003T150639_df_nodes.csv.gz b/tests/data/pldata/20231003T150639_df_nodes.csv.gz new file mode 100644 index 
0000000000000000000000000000000000000000..c1f7eadc3ce568ab1684bc056ee4023acfc02f59 Binary files /dev/null and b/tests/data/pldata/20231003T150639_df_nodes.csv.gz differ diff --git a/tests/data/pldata/20231003T150639_df_queues.csv.gz b/tests/data/pldata/20231003T150639_df_queues.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..34395bbb19ffb84f081f90c37f709f4e10097e53 Binary files /dev/null and b/tests/data/pldata/20231003T150639_df_queues.csv.gz differ diff --git a/tests/data/pldata/20231003T150639_df_reservations.csv.gz b/tests/data/pldata/20231003T150639_df_reservations.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..87220f4a44842cab7252da5bb63f519fe99d8d9e Binary files /dev/null and b/tests/data/pldata/20231003T150639_df_reservations.csv.gz differ diff --git a/tests/data/pldata/20231003T150639_df_server.csv.gz b/tests/data/pldata/20231003T150639_df_server.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..76433f1b6221bcccc1d28bd28565ff5de5b892e0 Binary files /dev/null and b/tests/data/pldata/20231003T150639_df_server.csv.gz differ diff --git a/tests/data/pldata/20231003T150639_df_vnodes.csv.gz b/tests/data/pldata/20231003T150639_df_vnodes.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..3a242f629d3e1bf85da0ccdcec244524c814e7b7 Binary files /dev/null and b/tests/data/pldata/20231003T150639_df_vnodes.csv.gz differ diff --git a/tests/data/pldata/20231003T150639_metadata.json.gz b/tests/data/pldata/20231003T150639_metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..72a76cfbdfc5d251ce6c291c666c1b2e5205b47e Binary files /dev/null and b/tests/data/pldata/20231003T150639_metadata.json.gz differ diff --git a/tests/data/pldata/20231003T150639_pbs_enums.json.gz b/tests/data/pldata/20231003T150639_pbs_enums.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..70151067353e050aa596cd745a83b68fc455c3d5 Binary files 
/dev/null and b/tests/data/pldata/20231003T150639_pbs_enums.json.gz differ diff --git a/tests/data/pldata/20231003T150639_pbsnodes.json.gz b/tests/data/pldata/20231003T150639_pbsnodes.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..0f747344e4e17c055e58fc5b1511e57118f92636 Binary files /dev/null and b/tests/data/pldata/20231003T150639_pbsnodes.json.gz differ diff --git a/tests/data/pldata/20231003T150639_qstat.json.gz b/tests/data/pldata/20231003T150639_qstat.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..441d4dfa2b23e6161d1e61b40e0815b961d9104b Binary files /dev/null and b/tests/data/pldata/20231003T150639_qstat.json.gz differ diff --git a/tests/data/pldata/20231003T151028_df_jobs.csv.gz b/tests/data/pldata/20231003T151028_df_jobs.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..06cd1846cac6627c007de05cd3d66c5000e98fdd Binary files /dev/null and b/tests/data/pldata/20231003T151028_df_jobs.csv.gz differ diff --git a/tests/data/pldata/20231003T151028_df_nodes.csv.gz b/tests/data/pldata/20231003T151028_df_nodes.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..debb417160fbedb79c47217752d75a27f91b53d4 Binary files /dev/null and b/tests/data/pldata/20231003T151028_df_nodes.csv.gz differ diff --git a/tests/data/pldata/20231003T151028_df_queues.csv.gz b/tests/data/pldata/20231003T151028_df_queues.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..244432ab052b62e793c37b61d0ce8776880c003c Binary files /dev/null and b/tests/data/pldata/20231003T151028_df_queues.csv.gz differ diff --git a/tests/data/pldata/20231003T151028_df_reservations.csv.gz b/tests/data/pldata/20231003T151028_df_reservations.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..3a03ff8bc98c85c9278bbc58e27b94cda29611dd Binary files /dev/null and b/tests/data/pldata/20231003T151028_df_reservations.csv.gz differ diff --git 
a/tests/data/pldata/20231003T151028_df_server.csv.gz b/tests/data/pldata/20231003T151028_df_server.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..77465c9604634f0f82f86de048a488f530f4933f Binary files /dev/null and b/tests/data/pldata/20231003T151028_df_server.csv.gz differ diff --git a/tests/data/pldata/20231003T151028_df_vnodes.csv.gz b/tests/data/pldata/20231003T151028_df_vnodes.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..6db3d12b41ae40a1024613b29650179c5f69563e Binary files /dev/null and b/tests/data/pldata/20231003T151028_df_vnodes.csv.gz differ diff --git a/tests/data/pldata/20231003T151028_metadata.json.gz b/tests/data/pldata/20231003T151028_metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..da160aac39a2c9342ff52479e7df332bce350c5c Binary files /dev/null and b/tests/data/pldata/20231003T151028_metadata.json.gz differ diff --git a/tests/data/pldata/20231003T151028_pbs_enums.json.gz b/tests/data/pldata/20231003T151028_pbs_enums.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..c764556cf9f378f627b0728cf615d8e81e9538d6 Binary files /dev/null and b/tests/data/pldata/20231003T151028_pbs_enums.json.gz differ diff --git a/tests/data/pldata/20231003T151028_pbsnodes.json.gz b/tests/data/pldata/20231003T151028_pbsnodes.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..31bcf0c16d4fd2fe6148f23e84d187a2f177b4bb Binary files /dev/null and b/tests/data/pldata/20231003T151028_pbsnodes.json.gz differ diff --git a/tests/data/pldata/20231003T151028_qstat.json.gz b/tests/data/pldata/20231003T151028_qstat.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..1e432fd52e5d71523250a53281571e1658f0653c Binary files /dev/null and b/tests/data/pldata/20231003T151028_qstat.json.gz differ diff --git a/tests/submit_script.sh b/tests/submit_script.sh new file mode 100755 index 
0000000000000000000000000000000000000000..f25c1cd8837cc00b2dad6b2da78b0b0eac70892b --- /dev/null +++ b/tests/submit_script.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +count=0 +while true; do + echo "count:$count"; + sleep 0.5 + (( count++ )) + if [[ "$count" -gt 20 ]]; then + exit 1 + fi +done; + diff --git a/tests/test_pbs_accounting.py b/tests/test_pbs_accounting.py new file mode 100644 index 0000000000000000000000000000000000000000..2adeadb297621c57fad77ca5020cd20bb915070b --- /dev/null +++ b/tests/test_pbs_accounting.py @@ -0,0 +1,2797 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +from typing import Tuple, List + +import pytest +from datetime import datetime +from textwrap import dedent +from pprint import pprint, pformat + +from dateutil.parser import parse as date_parse + +from PBS_Utils.pbs_accounting import ( + BadLineError, + get_time, + format_key_value_pairs, + create_pbs_log_line, + seconds_to_hms, + enhance_pbs_record, + Parse_Error_PBS, + FieldHandlers, +) +from PBS_Utils.pbs_accounting import LOG_REGEX, RecType +from PBS_Utils.pbs_accounting import fix_newlines_extract_time +from PBS_Utils.pbs_accounting import hms_to_seconds +from PBS_Utils.pbs_accounting import parse_key_value_pairs +from PBS_Utils.pbs_accounting import parse_pbs_log_line +from PBS_Utils.pbs_accounting import pbs_line_extract_time +from PBS_Utils.pbs_accounting import split_record, func_squote, func_dquote +from PBS_Utils.pbs_accounting import cast_to_place_dict, join_place_dict +from PBS_Utils.pbs_accounting import cast_to_select_list, join_select_list +from PBS_Utils.pbs_accounting import create_pbs_log_line_from_record, create_pbs_record + + +def test_LOG_REGEX(): + assert LOG_REGEX.LOG_FORMAT_PBS + assert 
LOG_REGEX.LOG_FORMAT_PBS_GROUPED + assert LOG_REGEX.LOG_FORMAT_DATA_SEPERATOR_PBS == ";" + assert LOG_REGEX.PATTERN_KEY_VALUE_PAIRS + + +def test_RecType(): + """this tests to make sure the record types have associated entries in the RecType.dct dictionary.""" + for key, value in RecType.dct.items(): + if key is None: + continue + assert getattr(RecType, key, None) is not None + assert value["record"] == key + + missing = [] + for key, value in RecType.__dict__.items(): + if (not key.startswith("_")) and (type(value) == str): + try: + RecType.dct[key] + except KeyError: + missing.append(key) + assert len(missing) == 0, pformat(missing) + + +def input_pbs_line_extract_time(ids=False) -> List[Tuple[str, Tuple[datetime, str]]]: + deck = [ + # vvv line vvv ( correct log time correct line) + ( + "03/24/2011 13:50:42;D;362971;requester=someuser99", + (date_parse("2011-03-24 13:50:42"), "D;362971;requester=someuser99"), + ), + ( + "11/18/2014 17:29:02;Q;248;Resource_List.ncpus=512 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "account=someaccount32 queue=default resource=default user=userd", + ( + date_parse("2014-11-18 17:29:02"), + "Q;248;Resource_List.ncpus=512 Resource_List.nodect=512 Resource_List.walltime=0:10:00 account=someaccount32 queue=default " + "resource=default user=userd", + ), + ), + ( + "11/18/2014 17:29:16;D;248;Resource_List.ncpus=512 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "requester=userd resource=default user=userd", + ( + date_parse("2014-11-18 17:29:16"), + "D;248;Resource_List.ncpus=512 Resource_List.nodect=512 Resource_List.walltime=0:10:00 requester=userd " + "resource=default user=userd", + ), + ), + ( + "11/18/2014 17:29:49;Q;249;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "account=someaccount32 queue=default resource=default user=userd", + ( + date_parse("2014-11-18 17:29:49"), + "Q;249;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 
account=someaccount32 " + "queue=default resource=default user=userd", + ), + ), + ( + "11/18/2014 17:30:13;Q;249;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "account=someaccount32 queue=default resource=default user=userd", + ( + date_parse("2014-11-18 17:30:13"), + "Q;249;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 account=someaccount32 " + "queue=default resource=default user=userd", + ), + ), + ( + "11/18/2014 17:39:26;S;243;Resource_List.ncpus=512 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "account=someaccount32 args= ctime=1416269680.72 cwd=/fs/somedirectoryobj43 etime=1416269680.72 exe=/bin/date " + "exec_host=IRN-48400-7B731-512 group=unknown jobname=N/A mode=c1 qtime=1416269680.72 queue=default resource=default " + "session=unknown start=1416353966.65 user=userd", + ( + date_parse("2014-11-18 17:39:26"), + "S;243;Resource_List.ncpus=512 Resource_List.nodect=512 Resource_List.walltime=0:10:00 account=someaccount32 args= " + "ctime=1416269680.72 cwd=/fs/somedirectoryobj43 etime=1416269680.72 exe=/bin/date exec_host=IRN-48400-7B731-512 " + "group=unknown jobname=N/A mode=c1 qtime=1416269680.72 queue=default resource=default session=unknown " + "start=1416353966.65 user=userd", + ), + ), + ( + "11/18/2014 17:39:44;S;249;Resource_List.ncpus=65536 Resource_List.nodect=1024 Resource_List.walltime=0:10:00 " + "account=someaccount32 args= ctime=1416353389.79 cwd=/fs/somedirectoryobj43 etime=1416353425.79 exe=/bin/date " + "exec_host=IRN-48480-7B7F1-1-1024 group=unknown jobname=N/A mode=c64 qtime=1416353413.72 queue=default " + "resource=default session=unknown start=1416353984.31 user=userd", + ( + date_parse("2014-11-18 17:39:44"), + "S;249;Resource_List.ncpus=65536 Resource_List.nodect=1024 Resource_List.walltime=0:10:00 account=someaccount32 args= " + "ctime=1416353389.79 cwd=/fs/somedirectoryobj43 etime=1416353425.79 exe=/bin/date exec_host=IRN-48480-7B7F1-1-1024 " 
+ "group=unknown jobname=N/A mode=c64 qtime=1416353413.72 queue=default resource=default session=unknown " + "start=1416353984.31 user=userd", + ), + ), + ( + "11/18/2014 17:40:02;Q;250;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:05:00 " + "account=someaccount32 queue=default resource=default user=userd", + ( + date_parse("2014-11-18 17:40:02"), + "Q;250;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:05:00 account=someaccount32 " + "queue=default resource=default user=userd", + ), + ), + ( + "11/18/2014 17:40:30;S;250;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:05:00 " + "account=someaccount32 args=--timeout,5000000 ctime=1416354002.41 cwd=/fs/somedirectoryobj43 etime=1416354016.26 exe=/bin/date " + "exec_host=IRN-48440-7B771-512 group=unknown jobname=N/A mode=c64 qtime=1416354002.41 queue=default resource=default " + "session=unknown start=1416354030.13 user=userd", + ( + date_parse("2014-11-18 17:40:30"), + "S;250;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:05:00 account=someaccount32 " + "args=--timeout,5000000 ctime=1416354002.41 cwd=/fs/somedirectoryobj43 etime=1416354016.26 exe=/bin/date " + "exec_host=IRN-48440-7B771-512 group=unknown jobname=N/A mode=c64 qtime=1416354002.41 queue=default " + "resource=default session=unknown start=1416354030.13 user=userd", + ), + ), + ( + "11/18/2014 17:41:33;E;243;Exit_status=0 Resource_List.ncpus=512 Resource_List.nodect=512 " + "Resource_List.walltime=0:10:00 account=someaccount32 args= ctime=1416269680.72 cwd=/fs/somedirectoryobj43 end=1416354093.12 " + "etime=1416269680.72 exe=/bin/date exec_host=IRN-48400-7B731-512 group=unknown jobname=N/A mode=c1 " + "qtime=1416269680.72 queue=default resource=default resources_used.location=IRN-48400-7B731-512 " + "resources_used.nodect=512 resources_used.walltime=0:02:06 session=unknown start=1416353966.65 user=userd", + ( + date_parse("2014-11-18 
17:41:33"), + "E;243;Exit_status=0 Resource_List.ncpus=512 Resource_List.nodect=512 Resource_List.walltime=0:10:00 account=someaccount32 " + "args= ctime=1416269680.72 cwd=/fs/somedirectoryobj43 end=1416354093.12 etime=1416269680.72 exe=/bin/date " + "exec_host=IRN-48400-7B731-512 group=unknown jobname=N/A mode=c1 qtime=1416269680.72 queue=default " + "resource=default resources_used.location=IRN-48400-7B731-512 resources_used.nodect=512 " + "resources_used.walltime=0:02:06 session=unknown start=1416353966.65 user=userd", + ), + ), + ( + "11/18/2014 17:42:14;E;249;Exit_status=0 Resource_List.ncpus=65536 Resource_List.nodect=1024 " + "Resource_List.walltime=0:10:00 account=someaccount32 args= ctime=1416353389.79 cwd=/fs/somedirectoryobj43 end=1416354134.03 " + "etime=1416353425.79 exe=/bin/date exec_host=IRN-48480-7B7F1-1-1024 group=unknown jobname=N/A mode=c64 " + "qtime=1416353413.72 queue=default resource=default resources_used.location=IRN-48480-7B7F1-1-1024 " + "resources_used.nodect=1024 resources_used.walltime=0:02:29 session=unknown start=1416353984.31 user=userd", + ( + date_parse("2014-11-18 17:42:14"), + "E;249;Exit_status=0 Resource_List.ncpus=65536 Resource_List.nodect=1024 Resource_List.walltime=0:10:00 " + "account=someaccount32 args= ctime=1416353389.79 cwd=/fs/somedirectoryobj43 end=1416354134.03 etime=1416353425.79 exe=/bin/date " + "exec_host=IRN-48480-7B7F1-1-1024 group=unknown jobname=N/A mode=c64 qtime=1416353413.72 queue=default " + "resource=default resources_used.location=IRN-48480-7B7F1-1-1024 resources_used.nodect=1024 " + "resources_used.walltime=0:02:29 session=unknown start=1416353984.31 user=userd", + ), + ), + ( + "11/18/2014 17:45:38;A;250;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:05:00 " + "resource=default user=userd", + ( + date_parse("2014-11-18 17:45:38"), + "A;250;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:05:00 resource=default " + "user=userd", + ), + ), + ( + 
"11/18/2014 17:45:58;E;250;Exit_status=143 Resource_List.ncpus=32768 Resource_List.nodect=512 " + "Resource_List.walltime=0:05:00 account=someaccount32 args=--timeout,5000000 ctime=1416354002.41 cwd=/fs/somedirectoryobj43 " + "end=1416354358.67 etime=1416354016.26 exe=/bin/date exec_host=IRN-48440-7B771-512 group=unknown jobname=N/A mode=c64 " + "qtime=1416354002.41 queue=default resource=default resources_used.location=IRN-48440-7B771-512 " + "resources_used.nodect=512 resources_used.walltime=0:05:28 session=unknown start=1416354030.13 user=userd", + ( + date_parse("2014-11-18 17:45:58"), + "E;250;Exit_status=143 Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:05:00 " + "account=someaccount32 args=--timeout,5000000 ctime=1416354002.41 cwd=/fs/somedirectoryobj43 end=1416354358.67 " + "etime=1416354016.26 exe=/bin/date exec_host=IRN-48440-7B771-512 group=unknown jobname=N/A mode=c64 " + "qtime=1416354002.41 queue=default resource=default resources_used.location=IRN-48440-7B771-512 " + "resources_used.nodect=512 resources_used.walltime=0:05:28 session=unknown start=1416354030.13 user=userd", + ), + ), + ( + "11/18/2014 17:46:12;Q;251;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "account=someaccount32 queue=default resource=default user=userd", + ( + date_parse("2014-11-18 17:46:12"), + "Q;251;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 account=someaccount32 " + "queue=default resource=default user=userd", + ), + ), + ( + "11/18/2014 17:46:27;D;251;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "requester=userd resource=default user=userd", + ( + date_parse("2014-11-18 17:46:27"), + "D;251;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 requester=userd " + "resource=default user=userd", + ), + ), + ( + "11/18/2014 17:46:35;Q;252;Resource_List.ncpus=32768 Resource_List.nodect=512 
Resource_List.walltime=0:10:00 " + "account=someaccount32 queue=default resource=default user=userd", + ( + date_parse("2014-11-18 17:46:35"), + "Q;252;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 account=someaccount32 " + "queue=default resource=default user=userd", + ), + ), + ( + "11/18/2014 17:46:52;S;252;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "account=someaccount32 args= ctime=1416354395.79 cwd=/fs/somedirectoryobj43 etime=1416354395.79 exe=/bin/date " + "exec_host=IRN-48840-7BB71-512 group=unknown jobname=N/A mode=c64 qtime=1416354395.79 queue=default " + "resource=default session=unknown start=1416354412.88 user=userd", + ( + date_parse("2014-11-18 17:46:52"), + "S;252;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 account=someaccount32 args= " + "ctime=1416354395.79 cwd=/fs/somedirectoryobj43 etime=1416354395.79 exe=/bin/date exec_host=IRN-48840-7BB71-512 " + "group=unknown jobname=N/A mode=c64 qtime=1416354395.79 queue=default resource=default session=unknown " + "start=1416354412.88 user=userd", + ), + ), + ( + "11/18/2014 17:47:26;D;252;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "requester=userd resource=default user=userd", + ( + date_parse("2014-11-18 17:47:26"), + "D;252;Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 requester=userd " + "resource=default user=userd", + ), + ), + ( + "11/18/2014 17:47:43;E;252;Exit_status=143 Resource_List.ncpus=32768 Resource_List.nodect=512 " + "Resource_List.walltime=0:10:00 account=someaccount32 args= ctime=1416354395.79 cwd=/fs/somedirectoryobj43 end=1416354463.84 " + "etime=1416354395.79 exe=/bin/date exec_host=IRN-48840-7BB71-512 group=unknown jobname=N/A mode=c64 " + "qtime=1416354395.79 queue=default resource=default resources_used.location=IRN-48840-7BB71-512 " + "resources_used.nodect=512 
resources_used.walltime=0:00:50 session=unknown start=1416354412.88 user=userd", + ( + date_parse("2014-11-18 17:47:43"), + "E;252;Exit_status=143 Resource_List.ncpus=32768 Resource_List.nodect=512 Resource_List.walltime=0:10:00 " + "account=someaccount32 args= ctime=1416354395.79 cwd=/fs/somedirectoryobj43 end=1416354463.84 etime=1416354395.79 exe=/bin/date " + "exec_host=IRN-48840-7BB71-512 group=unknown jobname=N/A mode=c64 qtime=1416354395.79 queue=default " + "resource=default resources_used.location=IRN-48840-7BB71-512 resources_used.nodect=512 " + "resources_used.walltime=0:00:50 session=unknown start=1416354412.88 user=userd", + ), + ), + ] + if ids: + deck = [f"{i:0>2}" for i, _ in enumerate(deck)] + return deck + + +def input_parse_kv_pairs(ids=False): + deck = [ + ( + "job record", + "01/01/1970 01:01:01;Q;77777777;Resource_List.ncpus=8192 Resource_List.nodect=8192 Resource_List.walltime=1:16:00 " + "args= ctime=1295298682.99 cwd=/fs/somedirectoryobjB etime=1295298682.99 " + "exe=/fs/somedirectoryobjC exec_host=ANL-R30-R37-8192 group=unknown jobname=N/A " + "mode=co qtime=1295298682.99 queue=default session=unknown start=1295298688.17 user=someuser99", + { + "exec_host": "ANL-R30-R37-8192", + "exe": "/fs/somedirectoryobjC", + "group": "unknown", + "ctime": "1295298682.99", + "Resource_List.ncpus": "8192", + "qtime": "1295298682.99", + "Resource_List.walltime": "1:16:00", + "etime": "1295298682.99", + "queue": "default", + "start": "1295298688.17", + "session": "unknown", + "mode": "co", + "Resource_List.nodect": "8192", + "jobname": "N/A", + "cwd": "/fs/somedirectoryobjB", + "user": "someuser99", + "args": "", + }, + True, + ), + ( + "job record", + "01/01/1970 01:01:01;Q;77777777;Resource_List.ncpus=8192 Resource_List.nodect=8192 Resource_List.walltime=1:16:00 " + "args= ctime=1295298682.99 cwd=/fs/somedirectoryobjB etime=1295298682.99 " + "exe=/fs/somedirectoryobjC exec_host=ANL-R30-R37-8192 group=unknown jobname=N/A " + "mode=co qtime=1295298682.99 
queue=default session=unknown start=1295298688.17 user=someuser99", + { + "exec_host": "ANL-R30-R37-8192", + "exe": "/fs/somedirectoryobjC", + "group": "unknown", + "ctime": "1295298682.99", + "Resource_List.ncpus": "8192", + "qtime": "1295298682.99", + "Resource_List.walltime": "1:16:00", + "etime": "1295298682.99", + "queue": "default", + "start": "1295298688.17", + "session": "unknown", + "mode": "co", + "Resource_List.nodect": "8192", + "jobname": "N/A", + "cwd": "/fs/somedirectoryobjB", + "user": "someuser99", + "args": "", + }, + True, + ), + ( + "", + "01/01/1970 01:01:01;Q;77777777;Resource_List.ncpus=8192 Resource_List.nodect=8192 Resource_List.walltime=1:16:00 " + "args= ctime=1295298682.99 " + "cwd=/fs/somedirectoryobjB etime=1295298682.99 " + "exe=/fs/somedirectoryobjC exec_host=ANL-R30-R37-8192 group=unknown jobname=N/A " + 'mode=co qtime=1295298682.99 queue="default QUE" session=unknown start=1295298688.17 user=someuser99', + { + "exec_host": "ANL-R30-R37-8192", + "exe": "/fs/somedirectoryobjC", + "group": "unknown", + "ctime": "1295298682.99", + "Resource_List.ncpus": "8192", + "qtime": "1295298682.99", + "Resource_List.walltime": "1:16:00", + "etime": "1295298682.99", + "queue": "default QUE", + "start": "1295298688.17", + "session": "unknown", + "mode": "co", + "Resource_List.nodect": "8192", + "jobname": "N/A", + "cwd": "/fs/somedirectoryobjB", + "user": "someuser99", + "args": "", + }, + True, + ), + ( + "job record", + "01/01/1970 01:01:01;Q;77777777;Resource_List.ncpus=256 Resource_List.nodect=256 Resource_List.walltime=12:00:00 " + "account=esp " + "args=/fs/somedirectoryobj8,9,ADBC ctime=1358307943.61 " + "cwd=/fs/somedirectoryobj9 etime=1358349957.24 " + "exe=/fs/somedirectoryobjA exec_host=MIR-00CC0-33FF1-512 " + "group=unknown jobname=somejobname mode=script qtime=1358307962.54 queue=somequeue78 session=unknown " + "start=1358349989.3 user=someuser1", + { + "Resource_List.ncpus": "256", + "Resource_List.nodect": "256", + 
"Resource_List.walltime": "12:00:00", + "account": "esp", + "args": "/fs/somedirectoryobj8,9,ADBC", + "ctime": "1358307943.61", + "cwd": "/fs/somedirectoryobj9", + "etime": "1358349957.24", + "exe": "/fs/somedirectoryobjA", + "exec_host": "MIR-00CC0-33FF1-512", + "group": "unknown", + "jobname": "somejobname", + "mode": "script", + "qtime": "1358307962.54", + "queue": "somequeue78", + "session": "unknown", + "start": "1358349989.3", + "user": "someuser1", + }, + True, + ), + ( + "simple kv pairs", + "01/01/1970 01:01:01;Q;77777777;key_a=pqr args=abc def ghi key_b=jkl key_c=mno", + { + "key_a": "pqr", + "key_b": "jkl", + "key_c": "mno", + "args": "abc def ghi", + }, + True, + ), + ( + "embedded structures", + '01/01/1970 01:01:01;Q;77777777;key_a=pqr args=abc=234def:asdfghi:{"zxcv":asdf} key_b=jkl key_c=mno', + { + "key_a": "pqr", + "key_b": "jkl", + "key_c": "mno", + "args": 'abc=234def:asdfghi:{"zxcv":asdf}', + }, + True, + ), + ( + "unicode characters embedded", + r"01/01/1970 01:01:01;Q;77777777;" + r"Resource_List.ncpus=961 Resource_List.nodect=961 Resource_List.walltime=1:00:00 account=Performance args=[" + r"u'\u2013i', 'i_lsms', u'\u2013mode', '1d', u'\u2013size_lsms', '1024', u'\u2013num_lsms', '30', " + r"u'\u2013num_steps', '600'] ctime=1380743728.06 " + r"cwd=/fs/somedirectoryobjD etime=1380743728.06 " + r"exe=/fs/somedirectoryobj92 exec_host=MIR-48880-7BBF1-1-1024 group=unknown " + r"jobname=output mode=c1 qtime=1380743728.06 queue=somequeue09 session=unknown start=1380749287.08 user=someuser2", + { + "Resource_List.ncpus": "961", + "Resource_List.nodect": "961", + "Resource_List.walltime": "1:00:00", + "account": "Performance", + "args": "[u'\\u2013i', 'i_lsms', u'\\u2013mode', '1d', u'\\u2013size_lsms', '1024', u'\\u2013num_lsms', " + "'30', u'\\u2013num_steps', '600']", + "ctime": "1380743728.06", + "cwd": "/fs/somedirectoryobjD", + "etime": "1380743728.06", + "exe": "/fs/somedirectoryobj92", + "exec_host": "MIR-48880-7BBF1-1-1024", + "group": 
"unknown", + "jobname": "output", + "mode": "c1", + "qtime": "1380743728.06", + "queue": "somequeue09", + "session": "unknown", + "start": "1380749287.08", + "user": "someuser2", + }, + False, + ), # BADDATA note this is false due to the embedded quotes and the lack of the ability to handle them. + ( + """simple k/v pairs with spaces""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=8 abc=123 789 oiu=645""", + { + "Exit_status": "2", + "tyu": "8", + "abc": "123 789", + "oiu": "645", + }, + True, + ), + ( + """k/v pair with double quotes""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=8 abc="123 789" oiu=645""", + { + "Exit_status": "2", + "tyu": "8", + "abc": "123 789", + "oiu": "645", + }, + True, + ), + ( + """k/v pair with single quotes""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=8 abc='123 789' oiu=645""", + { + "Exit_status": "2", + "tyu": "8", + "abc": "123 789", + "oiu": "645", + }, + True, + ), + ( + """k/v pair with single quote in double quotes""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=8 abc="123'789" oiu=645""", + { + "Exit_status": "2", + "tyu": "8", + "abc": "123'789", + "oiu": "645", + }, + False, + ), # BADDATA note this is false due to the embedded quotes and the lack of the ability to handle them. + ( + """k/v pair with double quote in single quotes""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=8 abc='123"789' oiu=645""", + BadLineError, + # removed the single quoting around the right side, 20231025, Eric + # { + # 'Exit_status': '2', + # 'tyu': '8', + # 'abc': '123"789', + # 'oiu': '645', + # }, + False, + ), # BADDATA note this is false due to the embedded quotes and the lack of the ability to handle them. 
+ ( + """k/v pair with escaped double quote in double quotes""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=8 abc="123\\"789" oiu=645""", + { + "Exit_status": "2", + "tyu": "8", + "abc": '123"789', + "oiu": "645", + }, + False, + ), # BADDATA note this is false due to the embedded quotes and the lack of the ability to handle them. + ( + """k/v pair with escaped single quote in single quotes""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=8 abc='123\\'789' oiu=645""", + { + "Exit_status": "2", + "tyu": "8", + "abc": "123'789", + "oiu": "645", + }, + False, + ), # BADDATA note this is false due to the embedded quotes and the lack of the ability to handle them. + ( + """k/v pairs in k/v pair""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 abc="qwe=123 tyu=456 ljk='789'" oiu=645""", + { + "Exit_status": "2", + "abc": "qwe=123 tyu=456 ljk='789'", + "oiu": "645", + }, + False, + ), # BADDATA note this is false due to the embedded quotes and the lack of the ability to handle them. + ( + """k/v pairs in k/v pair with duplicate tyu and escaped quotes""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=987 abc="qwe=\"123\" tyu=456 ljk='789'" oiu=645""", + { + "Exit_status": "2", + "tyu": "987", + "abc": "qwe=\"123\" tyu=456 ljk='789'", + "oiu": "645", + }, + False, + ), # BADDATA note this is false due to the embedded quotes and the lack of the ability to handle them. 
+ ( + """k/v pairs in k/v pair with inner k/v pairs values quoted""", + """01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=987 abc=qwe="123 759" someuser3 "tyu=456" 'ljk=789' bar oiu=645""", + { + "Exit_status": "2", + "tyu": "987", + "abc": 'qwe="123 759" someuser3 "tyu=456" \'ljk=789\' bar', + "oiu": "645", + }, + True, + ), + ( + """really complex k/v pairs""", + "01/01/1970 01:01:01;Q;77777777;" + "Exit_status=2 Resource_List.ncpus=1 Resource_List.nodect=1 Resource_List.walltime=0:30:00 account=Foo " + "args=-c,\"env WORLD='hello' PATTERN='^$' make -C /fs/somedirectoryobj97 " + "EPREFIX=/fs/somedirectoryobj90 NODES=1 MAP_BY=node MPIRUN_ARGS='' " + 'logdir/my-prog-7-map-by-node.log" ctime=1603595566.28 cwd=/fs/somedirectoryobj96 ' + "end=1603595710.77 etime=1603595566.28 exe=/fs/somedirectoryobj98 exec_host=3834 group=unknown " + "jobname=my.job mode=script qtime=1603595566.28 queue=somequeue1 resource=somemachine3 " + "resources_used.location=3834 resources_used.nodect=1 resources_used.walltime=0:01:46 session=unknown " + "start=1603595604.7 user=someuser3 ", + { + "Exit_status": "2", + "Resource_List.ncpus": "1", + "Resource_List.nodect": "1", + "Resource_List.walltime": "0:30:00", + "account": "Foo", + "args": "-c,\"env \tWORLD='hello' PATTERN='^$' \tmake -C /fs/somedirectoryobj97 " + "EPREFIX=/fs/somedirectoryobj90 \tNODES=1 MAP_BY=node \tMPIRUN_ARGS='' " + '\tlogdir/my-prog-7-map-by-node.log"', + "ctime": "1603595566.28", + "cwd": "/fs/somedirectoryobj96", + "end": "1603595710.77", + "etime": "1603595566.28", + "exe": "/fs/somedirectoryobj98", + "exec_host": "3834", + "group": "unknown", + "jobname": "my.job", + "mode": "script", + "qtime": "1603595566.28", + "queue": "somequeue1", + "resource": "somemachine3", + "resources_used.location": "3834", + "resources_used.nodect": "1", + "resources_used.walltime": "0:01:46", + "session": "unknown", + "start": "1603595604.7", + "user": "someuser3", + }, + True, + ), + ( + """k/v pairs with missing quote""", + 
"""01/01/1970 01:01:01;Q;77777777;Exit_status=2 tyu=987 abc='qwe=123 759 tyu=456 ljk=789' oiu='645""", + BadLineError, + True, + ), + ( + "duplicate Exit_status", + "05/19/2022 21:33:28;R;244329.somehost;user=someuser432 group=Group project=someuser72 " + "jobname=rfm_job queue=somequeue32 ctime=1652994845 qtime=1652994845 etime=1652994845 start=1652995653 " + "exec_host=x3104c0s19b1n0/0*64+x3104c0s1b0n0/0*64+x3104c0s1b1n0/0*64+x3104c0s25b0n0/0*64+x3104c0s25b1n0/0*64" + "+x3104c0s31b0n0/0*64+x3104c0s31b1n0/0*64+x3104c0s37b0n0/0*64+x3104c0s37b1n0/0*64+x3104c0s7b0n0/0*64+x3105c0s13b0n0/0" + "*64+x3105c0s13b1n0/0*64+x3105c0s19b0n0/0*64+x3105c0s19b1n0/0*64+x3105c0s1b0n0/0*64+x3105c0s1b1n0/0*64+x3105c0s25b0n0" + "/0*64+x3105c0s25b1n0/0*64+x3105c0s31b0n0/0*64+x3105c0s31b1n0/0*64+x3105c0s37b0n0/0*64+x3105c0s37b1n0/0*64" + "+x3105c0s7b0n0/0*64+x3105c0s7b1n0/0*64+x3106c0s13b0n0/0*64+x3106c0s13b1n0/0*64+x3106c0s19b0n0/0*64+x3106c0s19b1n0/0" + "*64+x3106c0s1b0n0/0*64+x3106c0s1b1n0/0*64+x3106c0s25b0n0/0*64+x3106c0s25b1n0/0*64+x3106c0s31b0n0/0*64+x3106c0s31b1n0" + "/0*64+x3106c0s37b0n0/0*64+x3106c0s37b1n0/0*64+x3106c0s7b0n0/0*64+x3106c0s7b1n0/0*64+x3107c0s13b0n0/0*64" + "+x3107c0s13b1n0/0*64+x3107c0s19b0n0/0*64+x3107c0s19b1n0/0*64+x3107c0s25b0n0/0*64+x3107c0s25b1n0/0*64+x3107c0s31b0n0" + "/0*64+x3107c0s31b1n0/0*64+x3107c0s37b0n0/0*64+x3107c0s37b1n0/0*64+x3107c0s7b0n0/0*64+x3107c0s7b1n0/0*64" + "+x3108c0s13b0n0/0*64+x3108c0s13b1n0/0*64+x3108c0s19b0n0/0*64+x3108c0s19b1n0/0*64+x3108c0s1b0n0/0*64+x3108c0s1b1n0/0" + "*64+x3108c0s25b0n0/0*64+x3108c0s25b1n0/0*64+x3108c0s31b0n0/0*64+x3108c0s31b1n0/0*64+x3108c0s37b0n0/0*64" + "+x3108c0s37b1n0/0*64+x3208c0s37b1n0/0*64+x3208c0s7b0n0/0*64 exec_vnode=(x3104c0s19b1n0:ncpus=64)+(" + "x3104c0s1b0n0:ncpus=64)+(x3104c0s1b1n0:ncpus=64)+(x3104c0s25b0n0:ncpus=64)+(x3104c0s25b1n0:ncpus=64)+(" + "x3104c0s31b0n0:ncpus=64)+(x3104c0s31b1n0:ncpus=64)+(x3104c0s37b0n0:ncpus=64)+(x3104c0s37b1n0:ncpus=64)+(" + 
"x3104c0s7b0n0:ncpus=64)+(x3105c0s13b0n0:ncpus=64)+(x3105c0s13b1n0:ncpus=64)+(x3105c0s19b0n0:ncpus=64)+(" + "x3105c0s19b1n0:ncpus=64)+(x3105c0s1b0n0:ncpus=64)+(x3105c0s1b1n0:ncpus=64)+(x3105c0s25b0n0:ncpus=64)+(" + "x3105c0s25b1n0:ncpus=64)+(x3105c0s31b0n0:ncpus=64)+(x3105c0s31b1n0:ncpus=64)+(x3105c0s37b0n0:ncpus=64)+(" + "x3105c0s37b1n0:ncpus=64)+(x3105c0s7b0n0:ncpus=64)+(x3105c0s7b1n0:ncpus=64)+(x3106c0s13b0n0:ncpus=64)+(" + "x3106c0s13b1n0:ncpus=64)+(x3106c0s19b0n0:ncpus=64)+(x3106c0s19b1n0:ncpus=64)+(x3106c0s1b0n0:ncpus=64)+(" + "x3106c0s1b1n0:ncpus=64)+(x3106c0s25b0n0:ncpus=64)+(x3106c0s25b1n0:ncpus=64)+(x3106c0s31b0n0:ncpus=64)+(" + "x3106c0s31b1n0:ncpus=64)+(x3106c0s37b0n0:ncpus=64)+(x3106c0s37b1n0:ncpus=64)+(x3106c0s7b0n0:ncpus=64)+(" + "x3106c0s7b1n0:ncpus=64)+(x3107c0s13b0n0:ncpus=64)+(x3107c0s13b1n0:ncpus=64)+(x3107c0s19b0n0:ncpus=64)+(" + "x3107c0s19b1n0:ncpus=64)+(x3107c0s25b0n0:ncpus=64)+(x3107c0s25b1n0:ncpus=64)+(x3107c0s31b0n0:ncpus=64)+(" + "x3107c0s31b1n0:ncpus=64)+(x3107c0s37b0n0:ncpus=64)+(x3107c0s37b1n0:ncpus=64)+(x3107c0s7b0n0:ncpus=64)+(" + "x3107c0s7b1n0:ncpus=64)+(x3108c0s13b0n0:ncpus=64)+(x3108c0s13b1n0:ncpus=64)+(x3108c0s19b0n0:ncpus=64)+(" + "x3108c0s19b1n0:ncpus=64)+(x3108c0s1b0n0:ncpus=64)+(x3108c0s1b1n0:ncpus=64)+(x3108c0s25b0n0:ncpus=64)+(" + "x3108c0s25b1n0:ncpus=64)+(x3108c0s31b0n0:ncpus=64)+(x3108c0s31b1n0:ncpus=64)+(x3108c0s37b0n0:ncpus=64)+(" + "x3108c0s37b1n0:ncpus=64)+(x3208c0s37b1n0:ncpus=64)+(x3208c0s7b0n0:ncpus=64) Resource_List.ncpus=4096 " + "Resource_List.nodect=64 Resource_List.place=scatter Resource_List.select=64:system=somemachine4 " + "Resource_List.system=somemachine4 Resource_List.walltime=00:15:00 session=28643 end=1652996008 Exit_status=0 " + "resources_used.cpupercent=24565 resources_used.cput=24:56:06 resources_used.mem=238265672kb " + "resources_used.ncpus=4096 resources_used.vmem=253929436kb resources_used.walltime=00:07:22 Exit_status=-20 " + "eligible_time=00:00:00 run_count=2", + { + "Exit_status": "0", + 
"Exit_list": "0,-20", + "Resource_List.ncpus": "4096", + "Resource_List.nodect": "64", + "Resource_List.place": "scatter", + "Resource_List.select": "64:system=somemachine4", + "Resource_List.system": "somemachine4", + "Resource_List.walltime": "00:15:00", + "ctime": "1652994845", + "eligible_time": "00:00:00", + "end": "1652996008", + "etime": "1652994845", + "exec_host": "x3104c0s19b1n0/0*64+x3104c0s1b0n0/0*64+x3104c0s1b1n0/0*64+x3104c0s25b0n0/0*64+x3104c0s25b1n0/0*64" + "+x3104c0s31b0n0/0*64+x3104c0s31b1n0/0*64+x3104c0s37b0n0/0*64+x3104c0s37b1n0/0*64+x3104c0s7b0n0/0*64" + "+x3105c0s13b0n0/0*64+x3105c0s13b1n0/0*64+x3105c0s19b0n0/0*64+x3105c0s19b1n0/0*64+x3105c0s1b0n0/0*64" + "+x3105c0s1b1n0/0*64+x3105c0s25b0n0/0*64+x3105c0s25b1n0/0*64+x3105c0s31b0n0/0*64+x3105c0s31b1n0/0*64" + "+x3105c0s37b0n0/0*64+x3105c0s37b1n0/0*64+x3105c0s7b0n0/0*64+x3105c0s7b1n0/0*64+x3106c0s13b0n0/0*64" + "+x3106c0s13b1n0/0*64+x3106c0s19b0n0/0*64+x3106c0s19b1n0/0*64+x3106c0s1b0n0/0*64+x3106c0s1b1n0/0*64" + "+x3106c0s25b0n0/0*64+x3106c0s25b1n0/0*64+x3106c0s31b0n0/0*64+x3106c0s31b1n0/0*64+x3106c0s37b0n0/0*64" + "+x3106c0s37b1n0/0*64+x3106c0s7b0n0/0*64+x3106c0s7b1n0/0*64+x3107c0s13b0n0/0*64+x3107c0s13b1n0/0*64" + "+x3107c0s19b0n0/0*64+x3107c0s19b1n0/0*64+x3107c0s25b0n0/0*64+x3107c0s25b1n0/0*64+x3107c0s31b0n0/0*64" + "+x3107c0s31b1n0/0*64+x3107c0s37b0n0/0*64+x3107c0s37b1n0/0*64+x3107c0s7b0n0/0*64+x3107c0s7b1n0/0*64" + "+x3108c0s13b0n0/0*64+x3108c0s13b1n0/0*64+x3108c0s19b0n0/0*64+x3108c0s19b1n0/0*64+x3108c0s1b0n0/0*64" + "+x3108c0s1b1n0/0*64+x3108c0s25b0n0/0*64+x3108c0s25b1n0/0*64+x3108c0s31b0n0/0*64+x3108c0s31b1n0/0*64" + "+x3108c0s37b0n0/0*64+x3108c0s37b1n0/0*64+x3208c0s37b1n0/0*64+x3208c0s7b0n0/0*64", + "exec_vnode": "(x3104c0s19b1n0:ncpus=64)+(x3104c0s1b0n0:ncpus=64)+(x3104c0s1b1n0:ncpus=64)+(" + "x3104c0s25b0n0:ncpus=64)+(x3104c0s25b1n0:ncpus=64)+(x3104c0s31b0n0:ncpus=64)+(" + "x3104c0s31b1n0:ncpus=64)+(x3104c0s37b0n0:ncpus=64)+(x3104c0s37b1n0:ncpus=64)+(" + 
"x3104c0s7b0n0:ncpus=64)+(x3105c0s13b0n0:ncpus=64)+(x3105c0s13b1n0:ncpus=64)+(" + "x3105c0s19b0n0:ncpus=64)+(x3105c0s19b1n0:ncpus=64)+(x3105c0s1b0n0:ncpus=64)+(" + "x3105c0s1b1n0:ncpus=64)+(x3105c0s25b0n0:ncpus=64)+(x3105c0s25b1n0:ncpus=64)+(" + "x3105c0s31b0n0:ncpus=64)+(x3105c0s31b1n0:ncpus=64)+(x3105c0s37b0n0:ncpus=64)+(" + "x3105c0s37b1n0:ncpus=64)+(x3105c0s7b0n0:ncpus=64)+(x3105c0s7b1n0:ncpus=64)+(" + "x3106c0s13b0n0:ncpus=64)+(x3106c0s13b1n0:ncpus=64)+(x3106c0s19b0n0:ncpus=64)+(" + "x3106c0s19b1n0:ncpus=64)+(x3106c0s1b0n0:ncpus=64)+(x3106c0s1b1n0:ncpus=64)+(" + "x3106c0s25b0n0:ncpus=64)+(x3106c0s25b1n0:ncpus=64)+(x3106c0s31b0n0:ncpus=64)+(" + "x3106c0s31b1n0:ncpus=64)+(x3106c0s37b0n0:ncpus=64)+(x3106c0s37b1n0:ncpus=64)+(" + "x3106c0s7b0n0:ncpus=64)+(x3106c0s7b1n0:ncpus=64)+(x3107c0s13b0n0:ncpus=64)+(" + "x3107c0s13b1n0:ncpus=64)+(x3107c0s19b0n0:ncpus=64)+(x3107c0s19b1n0:ncpus=64)+(" + "x3107c0s25b0n0:ncpus=64)+(x3107c0s25b1n0:ncpus=64)+(x3107c0s31b0n0:ncpus=64)+(" + "x3107c0s31b1n0:ncpus=64)+(x3107c0s37b0n0:ncpus=64)+(x3107c0s37b1n0:ncpus=64)+(" + "x3107c0s7b0n0:ncpus=64)+(x3107c0s7b1n0:ncpus=64)+(x3108c0s13b0n0:ncpus=64)+(" + "x3108c0s13b1n0:ncpus=64)+(x3108c0s19b0n0:ncpus=64)+(x3108c0s19b1n0:ncpus=64)+(" + "x3108c0s1b0n0:ncpus=64)+(x3108c0s1b1n0:ncpus=64)+(x3108c0s25b0n0:ncpus=64)+(" + "x3108c0s25b1n0:ncpus=64)+(x3108c0s31b0n0:ncpus=64)+(x3108c0s31b1n0:ncpus=64)+(" + "x3108c0s37b0n0:ncpus=64)+(x3108c0s37b1n0:ncpus=64)+(x3208c0s37b1n0:ncpus=64)+(x3208c0s7b0n0:ncpus=64)", + "group": "Group", + "jobname": "rfm_job", + "project": "someuser72", + "qtime": "1652994845", + "queue": "somequeue32", + "resources_used.cpupercent": "24565", + "resources_used.cput": "24:56:06", + "resources_used.mem": "238265672kb", + "resources_used.ncpus": "4096", + "resources_used.vmem": "253929436kb", + "resources_used.walltime": "00:07:22", + "run_count": "2", + "session": "28643", + "start": "1652995653", + "user": "someuser432", + }, + True, + ), + ( + "malformed command line 
leakage: Resources_List.-I=", + """02/23/2024 22:33:55;Q;621652.somehost;user=someuser92 group=somegroup21 + account="someproject" project=someproject jobname=hello_world.sh queue=somequeue ctime=1708727635 qtime=1708727635 + etime=0 Resource_List.allow_account_check_failure=True Resource_List.allow_negative_allocation=True + Resource_List.award_category=someword34 Resource_List.award_type=someword234 Resource_List.burn_ratio=0 + Resource_List.current_allocation=-280471872 Resource_List.hbm_mode=flat + Resource_List.hpcm_image=compute_test_20240120T003017_0051f5d Resource_List.ncpus=208 Resource_List.ni_resource=aurora + Resource_List.nodect=1 Resource_List.overburn=False Resource_List.place=scatter Resource_List.project_priority=2 + Resource_List.route_backfill=True Resource_List.select=1 Resource_List.snc_mode=quad Resource_List.total_allocation=0 + Resource_List.walltime=00:30:00 Resource_List.-I=""", + { + "Resource_List.-I": "", + "Resource_List.allow_account_check_failure": "True", + "Resource_List.allow_negative_allocation": "True", + "Resource_List.award_category": "someword34", + "Resource_List.award_type": "someword234", + "Resource_List.burn_ratio": "0", + "Resource_List.current_allocation": "-280471872", + "Resource_List.hbm_mode": "flat", + "Resource_List.hpcm_image": "compute_test_20240120T003017_0051f5d", + "Resource_List.ncpus": "208", + "Resource_List.ni_resource": "aurora", + "Resource_List.nodect": "1", + "Resource_List.overburn": "False", + "Resource_List.place": "scatter", + "Resource_List.project_priority": "2", + "Resource_List.route_backfill": "True", + "Resource_List.select": "1", + "Resource_List.snc_mode": "quad", + "Resource_List.total_allocation": "0", + "Resource_List.walltime": "00:30:00", + "account": "someproject", + "ctime": "1708727635", + "etime": "0", + "group": "somegroup21", + "jobname": "hello_world.sh", + "project": "someproject", + "qtime": "1708727635", + "queue": "somequeue", + "user": "someuser92", + }, + True, + ), + ( 
+ "'-' in Resource name: Resources_List.foo-bar", + """02/23/2024 22:33:55;Q;621652.somehost;user=someuser92 group=somegroup21 + account="someproject" project=someproject jobname=hello_world.sh queue=somequeue ctime=1708727635 qtime=1708727635 + etime=0 Resource_List.allow_account_check_failure=True Resource_List.allow_negative_allocation=True + Resource_List.award_category=someword34 Resource_List.award_type=someword234 Resource_List.burn_ratio=0 + Resource_List.current_allocation=-280471872 Resource_List.hbm_mode=flat + Resource_List.hpcm_image=compute_test_20240120T003017_0051f5d Resource_List.ncpus=208 Resource_List.ni_resource=aurora + Resource_List.nodect=1 Resource_List.overburn=False Resource_List.place=scatter Resource_List.project_priority=2 + Resource_List.route_backfill=True Resource_List.select=1 Resource_List.snc_mode=quad Resource_List.total_allocation=0 + Resource_List.walltime=00:30:00 Resource_List.foo-bar=baz""", + { + "Resource_List.foo-bar": "baz", + "Resource_List.allow_account_check_failure": "True", + "Resource_List.allow_negative_allocation": "True", + "Resource_List.award_category": "someword34", + "Resource_List.award_type": "someword234", + "Resource_List.burn_ratio": "0", + "Resource_List.current_allocation": "-280471872", + "Resource_List.hbm_mode": "flat", + "Resource_List.hpcm_image": "compute_test_20240120T003017_0051f5d", + "Resource_List.ncpus": "208", + "Resource_List.ni_resource": "aurora", + "Resource_List.nodect": "1", + "Resource_List.overburn": "False", + "Resource_List.place": "scatter", + "Resource_List.project_priority": "2", + "Resource_List.route_backfill": "True", + "Resource_List.select": "1", + "Resource_List.snc_mode": "quad", + "Resource_List.total_allocation": "0", + "Resource_List.walltime": "00:30:00", + "account": "someproject", + "ctime": "1708727635", + "etime": "0", + "group": "somegroup21", + "jobname": "hello_world.sh", + "project": "someproject", + "qtime": "1708727635", + "queue": "somequeue", + "user": 
"someuser92", + }, + True, + ), + ] + if ids: + deck = [f"{i:0>2}" for i, _ in enumerate(deck)] + return deck + + +def input_parse_pbs_log_line(ids=False): + deck = [ + ( + "", + "01/17/2011 15:11:22;Q;20;queue=default", + { + "event_type": "JOB", + "sub_identifier": "0", + "action": "queue", + "scheduler_timestamp": datetime(2011, 1, 17, 15, 11, 22), + "identifier": "20", + "queue": "default", + "record": "Q", + }, + True, + ), + ( + "", + "01/17/2011 15:11:23;Q;21;queue=default", + { + "event_type": "JOB", + "sub_identifier": "0", + "action": "queue", + "scheduler_timestamp": datetime(2011, 1, 17, 15, 11, 23), + "identifier": "21", + "queue": "default", + "record": "Q", + }, + True, + ), + ( + "", + """01/17/2011 15:11:23;Q;22;queue=default""", + { + "event_type": "JOB", + "sub_identifier": "0", + "action": "queue", + "scheduler_timestamp": datetime(2011, 1, 17, 15, 11, 23), + "identifier": "22", + "queue": "default", + "record": "Q", + }, + True, + ), + ( + "3", + "01/17/2011 15:11:28;S;20;Resource_List.ncpus=8192 Resource_List.nodect=8192 Resource_List.walltime=1:16:00 args= " + "ctime=1295298682.99 cwd=/fs/somedirectoryobjB etime=1295298682.99 " + "exe=/fs/somedirectoryobjC exec_host=ANL-R30-R37-8192 group=unknown jobname=N/A " + "mode=co qtime=1295298682.99 queue=default session=unknown start=1295298688.17 user=someuser99", + { + "event_type": "JOB", + "sub_identifier": "0", + "action": "start", + "ctime": datetime(2011, 1, 17, 21, 11, 22), + "cwd": "/fs/somedirectoryobjB", + "scheduler_timestamp": datetime(2011, 1, 17, 15, 11, 28), + "etime": datetime(2011, 1, 17, 21, 11, 22), + "exe": "/fs/somedirectoryobjC", + "exec_host": ["ANL-R30-R37-8192"], + "group": "unknown", + "identifier": "20", + "jobname": "N/A", + "mode": "co", + "Resource_List.nodect": 8192, + "qtime": datetime(2011, 1, 17, 21, 11, 22), + "queue": "default", + "record": "S", + "start": datetime(2011, 1, 17, 21, 11, 28), + "user": "someuser99", + "raw_user_name": "someuser99", + 
"Resource_List.walltime": 4560, + "Resource_List.ncpus": 8192, + "args": "", + "session": "unknown", + }, + True, + ), + ( + "4", + "01/17/2011 15:11:39;S;21;Resource_List.ncpus=8192 Resource_List.nodect=8192 Resource_List.walltime=1:16:00 args= " + "ctime=1295298683.38 cwd=/fs/somedirectoryobj77 " + "etime=1295298683.38 exe=/fs/somedirectoryobj76 exec_host=ANL-R20-R27-8192 " + "group=unknown jobname=N/A mode=co qtime=1295298683.38 queue=default session=unknown start=1295298699.15 " + "user=someuser99", + { + "event_type": "JOB", + "sub_identifier": "0", + "action": "start", + "ctime": datetime(2011, 1, 17, 21, 11, 23), + "etime": datetime(2011, 1, 17, 21, 11, 23), + "qtime": datetime(2011, 1, 17, 21, 11, 23), + "cwd": "/fs/somedirectoryobj77", + "scheduler_timestamp": datetime(2011, 1, 17, 15, 11, 39), + "exe": "/fs/somedirectoryobj76", + "exec_host": ["ANL-R20-R27-8192"], + "group": "unknown", + "identifier": "21", + "jobname": "N/A", + "mode": "co", + "Resource_List.nodect": 8192, + "queue": "default", + "record": "S", + "start": datetime(2011, 1, 17, 21, 11, 39), + "user": "someuser99", + "raw_user_name": "someuser99", + "Resource_List.walltime": 4560, + "Resource_List.ncpus": 8192, + "args": "", + "session": "unknown", + }, + True, + ), + ( + "5", + "01/17/2011 15:13:14;E;20;Exit_status=0 Resource_List.ncpus=8192 Resource_List.nodect=8192 " + "Resource_List.walltime=1:16:00 args= ctime=1295298682.99 " + "cwd=/fs/somedirectoryobjB end=1295298794.04 etime=1295298682.99 " + "exe=/fs/somedirectoryobjC exec_host=ANL-R30-R37-8192 group=unknown jobname=N/A " + "mode=co qtime=1295298682.99 queue=default resources_used.location=ANL-R30-R37-8192 " + "resources_used.nodect=8192 resources_used.walltime=0:01:45 session=unknown start=1295298688.17 user=someuser99", + { + "event_type": "JOB", + "sub_identifier": "0", + "Exit_status": "0", + "action": "end", + "ctime": datetime(2011, 1, 17, 21, 11, 22), + "cwd": "/fs/somedirectoryobjB", + "scheduler_timestamp": datetime(2011, 
1, 17, 15, 13, 14), + "etime": datetime(2011, 1, 17, 21, 11, 22), + "exe": "/fs/somedirectoryobjC", + "exec_host": ["ANL-R30-R37-8192"], + "group": "unknown", + "identifier": "20", + "jobname": "N/A", + "mode": "co", + "Resource_List.nodect": 8192, + "resources_used.nodect": 8192, + "qtime": datetime(2011, 1, 17, 21, 11, 22), + "queue": "default", + "record": "E", + "resources_used.walltime": 105, + "end": datetime(2011, 1, 17, 21, 13, 14), + "start": datetime(2011, 1, 17, 21, 11, 28), + "user": "someuser99", + "raw_user_name": "someuser99", + "Resource_List.walltime": 4560, + "Resource_List.ncpus": 8192, + "args": "", + "session": "unknown", + }, + True, + ), + ( + "6", + "01/17/2011 15:15:05;E;21;Exit_status=0 Resource_List.ncpus=8192 Resource_List.nodect=8192 " + "Resource_List.walltime=1:16:00 args= ctime=1295298683.38 " + "cwd=/fs/somedirectoryobj77 end=1295298905.96 etime=1295298683.38 " + "exe=/fs/somedirectoryobj76 exec_host=ANL-R20-R27-8192 group=unknown jobname=N/A " + "mode=co qtime=1295298683.38 queue=default resources_used.location=ANL-R20-R27-8192 " + "resources_used.nodect=8192 resources_used.walltime=0:03:26 session=unknown start=1295298699.15 user=someuser99", + { + "event_type": "JOB", + "sub_identifier": "0", + "Exit_status": "0", + "action": "end", + "ctime": datetime(2011, 1, 17, 21, 11, 23), + "etime": datetime(2011, 1, 17, 21, 11, 23), + "qtime": datetime(2011, 1, 17, 21, 11, 23), + "cwd": "/fs/somedirectoryobj77", + "scheduler_timestamp": datetime(2011, 1, 17, 15, 15, 5), + "exe": "/fs/somedirectoryobj76", + "exec_host": ["ANL-R20-R27-8192"], + "group": "unknown", + "identifier": "21", + "jobname": "N/A", + "mode": "co", + "Resource_List.nodect": 8192, + "resources_used.nodect": 8192, + "queue": "default", + "record": "E", + "resources_used.walltime": 206, + "end": datetime(2011, 1, 17, 21, 15, 5), + "start": datetime(2011, 1, 17, 21, 11, 39), + "user": "someuser99", + "raw_user_name": "someuser99", + "Resource_List.walltime": 4560, + 
"Resource_List.ncpus": 8192, + "args": "", + "session": "unknown", + }, + True, + ), + ( + "Job start: 1 exec_host, 2 exec_vnodes (qsub -l obj_type=gpu -l ncpus=5 -- /bin/sleep 10", + "04/16/2021 14:17:55;S;107.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=STDIN queue=somequeue32 " + "ctime=1618582675 qtime=1618582675 etime=1618582675 start=1618582675 exec_host=pdw-c1/0*0 exec_vnode=(" + "pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=1) Resource_List.ncpus=5 Resource_List.nodect=1 Resource_List.obj_type=gpu " + "Resource_List.place=pack Resource_List.select=1:ncpus=5:obj_type=gpu resource_assigned.ncpus=6", + { + "action": "start", + "record": "S", + "event_type": "JOB", + "identifier": "107.pdw-s1", + "scheduler_timestamp": datetime(2021, 4, 16, 14, 17, 55), + "sub_identifier": "0", + "user": "someuser73", + "raw_user_name": "someuser73", + "project": "someuser72", + "raw_project_name": "someuser72", + "queue": "somequeue32", + "jobname": "STDIN", + "group": "someuser73", + "Resource_List.nodect": 1, + "Resource_List.ncpus": 5, + # exec_host=pdw-c1/0*0 + "exec_host": ["pdw-c1"], + "raw_exec_vnode": "(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=1)", + "exec_vnode": ["pdw-c1-v00", "pdw-c1-v01"], + "ctime": datetime(2021, 4, 16, 14, 17, 55), + "qtime": datetime(2021, 4, 16, 14, 17, 55), + "etime": datetime(2021, 4, 16, 14, 17, 55), + "start": datetime(2021, 4, 16, 14, 17, 55), + "Resource_List.place": { + "arrangement": "pack", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 1, + "ncpus": "5", + "obj_type": "gpu", + }, + ], + }, + True, + ), + ( + "Job end: 1 exec_host, 2 exec_vnodes (qsub -l obj_type=gpu -l ncpus=5 -- /bin/sleep 10)", + "04/16/2021 14:18:06;E;107.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=STDIN queue=somequeue32 " + "ctime=1618582675 qtime=1618582675 etime=1618582675 start=1618582675 exec_host=pdw-c1/0*0 exec_vnode=(" + "pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=1) Resource_List.ncpus=5 
Resource_List.nodect=1 Resource_List.obj_type=gpu " + "Resource_List.place=pack Resource_List.select=1:ncpus=5:obj_type=gpu session=2015 end=1618582686 Exit_status=0 " + "resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=0kb resources_used.ncpus=5 " + "resources_used.vmem=0kb resources_used.walltime=00:00:10 run_count=1", + { + "action": "end", + "record": "E", + "event_type": "JOB", + "identifier": "107.pdw-s1", + "scheduler_timestamp": datetime(2021, 4, 16, 14, 18, 6), + "sub_identifier": "0", + "user": "someuser73", + "raw_user_name": "someuser73", + "project": "someuser72", + "raw_project_name": "someuser72", + "queue": "somequeue32", + "jobname": "STDIN", + "group": "someuser73", + "Resource_List.nodect": 1, + "Resource_List.ncpus": 5, + # exec_host=pdw-c1/0*0 + "exec_host": ["pdw-c1"], + "raw_exec_vnode": "(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=1)", + "exec_vnode": ["pdw-c1-v00", "pdw-c1-v01"], + "ctime": datetime(2021, 4, 16, 14, 17, 55), + "qtime": datetime(2021, 4, 16, 14, 17, 55), + "etime": datetime(2021, 4, 16, 14, 17, 55), + "start": datetime(2021, 4, 16, 14, 17, 55), + "end": datetime(2021, 4, 16, 14, 18, 6), + "Exit_status": "0", + "resources_used.walltime": 10, + "run_count": 1, + "Resource_List.place": { + "arrangement": "pack", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 1, + "ncpus": "5", + "obj_type": "gpu", + }, + ], + "session": "2015", + }, + True, + ), + ( + "Job start: 3 exec_hosts, 21 exec_vnodes (qsub -l select=3:ncpus=11:obj_type=gpu -- /bin/sleep 10)", + "04/16/2021 18:59:22;S;117.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=STDIN queue=somequeue32 " + "ctime=1618599562 qtime=1618599562 etime=1618599562 start=1618599562 exec_host=pdw-c1/0*0+pdw-c2/0*0+pdw-c3/0*0 " + "exec_vnode=(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=2+pdw-c1-v02:ncpus=1+pdw-c1-v03:ncpus=1+pdw-c1-v04:ncpus=1+pdw-c1" + "-v04-s0:ncpus=1+pdw-c1-v04-s1:ncpus=1)+(" + 
"pdw-c2-v00:ncpus=4+pdw-c2-v01:ncpus=2+pdw-c2-v02:ncpus=1+pdw-c2-v03:ncpus=1+pdw-c2-v04:ncpus=1+pdw-c2-v04-s0:ncpus=1" + "+pdw-c2-v04-s1:ncpus=1)+(pdw-c3-v00:ncpus=4+pdw-c3-v01:ncpus=2+pdw-c3-v02:ncpus=1+pdw-c3-v03:ncpus=1+pdw-c3-v04" + ":ncpus=1+pdw-c3-v04-s0:ncpus=1+pdw-c3-v04-s1:ncpus=1) Resource_List.ncpus=33 Resource_List.nodect=3 " + "Resource_List.place=free Resource_List.select=3:ncpus=11:obj_type=gpu resource_assigned.ncpus=33", + { + "action": "start", + "record": "S", + "event_type": "JOB", + "identifier": "117.pdw-s1", + "scheduler_timestamp": datetime(2021, 4, 16, 18, 59, 22), + "sub_identifier": "0", + "user": "someuser73", + "raw_user_name": "someuser73", + "project": "someuser72", + "raw_project_name": "someuser72", + "queue": "somequeue32", + "jobname": "STDIN", + "group": "someuser73", + "Resource_List.nodect": 3, + "Resource_List.ncpus": 33, + "exec_host": ["pdw-c1", "pdw-c2", "pdw-c3"], + "exec_vnode": [ + "pdw-c1-v00", + "pdw-c1-v01", + "pdw-c1-v02", + "pdw-c1-v03", + "pdw-c1-v04", + "pdw-c1-v04-s0", + "pdw-c1-v04-s1", + "pdw-c2-v00", + "pdw-c2-v01", + "pdw-c2-v02", + "pdw-c2-v03", + "pdw-c2-v04", + "pdw-c2-v04-s0", + "pdw-c2-v04-s1", + "pdw-c3-v00", + "pdw-c3-v01", + "pdw-c3-v02", + "pdw-c3-v03", + "pdw-c3-v04", + "pdw-c3-v04-s0", + "pdw-c3-v04-s1", + ], + "raw_exec_vnode": "(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=2+pdw-c1-v02:ncpus=1+pdw-c1-v03:ncpus=1+pdw-c1-v04:ncpus=1" + "+pdw-c1-v04-s0:ncpus=1+pdw-c1-v04-s1:ncpus=1)+(" + "pdw-c2-v00:ncpus=4+pdw-c2-v01:ncpus=2+pdw-c2-v02:ncpus=1+pdw-c2-v03:ncpus=1+pdw-c2-v04:ncpus=1" + "+pdw-c2-v04-s0:ncpus=1+pdw-c2-v04-s1:ncpus=1)+(" + "pdw-c3-v00:ncpus=4+pdw-c3-v01:ncpus=2+pdw-c3-v02:ncpus=1+pdw-c3-v03:ncpus=1+pdw-c3-v04:ncpus=1" + "+pdw-c3-v04-s0:ncpus=1+pdw-c3-v04-s1:ncpus=1)", + "ctime": datetime(2021, 4, 16, 18, 59, 22), + "qtime": datetime(2021, 4, 16, 18, 59, 22), + "etime": datetime(2021, 4, 16, 18, 59, 22), + "start": datetime(2021, 4, 16, 18, 59, 22), + "Resource_List.place": { + "arrangement": 
"free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 3, + "ncpus": "11", + "obj_type": "gpu", + }, + ], + }, + True, + ), + ( + "Job end: 3 exec_hosts, 21 exec_vnodes (qsub -l select=3:ncpus=11:obj_type=gpu -- /bin/sleep 10)", + "04/16/2021 18:59:34;E;117.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=STDIN queue=somequeue32 " + "ctime=1618599562 qtime=1618599562 etime=1618599562 start=1618599562 exec_host=pdw-c1/0*0+pdw-c2/0*0+pdw-c3/0*0 " + "exec_vnode=(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=2+pdw-c1-v02:ncpus=1+pdw-c1-v03:ncpus=1+pdw-c1-v04:ncpus=1+pdw-c1" + "-v04-s0:ncpus=1+pdw-c1-v04-s1:ncpus=1)+(" + "pdw-c2-v00:ncpus=4+pdw-c2-v01:ncpus=2+pdw-c2-v02:ncpus=1+pdw-c2-v03:ncpus=1+pdw-c2-v04:ncpus=1+pdw-c2-v04-s0:ncpus=1" + "+pdw-c2-v04-s1:ncpus=1)+(pdw-c3-v00:ncpus=4+pdw-c3-v01:ncpus=2+pdw-c3-v02:ncpus=1+pdw-c3-v03:ncpus=1+pdw-c3-v04" + ":ncpus=1+pdw-c3-v04-s0:ncpus=1+pdw-c3-v04-s1:ncpus=1) Resource_List.ncpus=33 Resource_List.nodect=3 " + "Resource_List.place=free Resource_List.select=3:ncpus=11:obj_type=gpu session=2063 end=1618599574 Exit_status=0 " + "resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=864kb resources_used.ncpus=33 " + "resources_used.vmem=12916kb resources_used.walltime=00:00:10 run_count=1", + { + "action": "end", + "record": "E", + "event_type": "JOB", + "identifier": "117.pdw-s1", + "scheduler_timestamp": datetime(2021, 4, 16, 18, 59, 34), + "sub_identifier": "0", + "user": "someuser73", + "raw_user_name": "someuser73", + "project": "someuser72", + "raw_project_name": "someuser72", + "queue": "somequeue32", + "jobname": "STDIN", + "group": "someuser73", + "Resource_List.nodect": 3, + "Resource_List.ncpus": 33, + "exec_host": ["pdw-c1", "pdw-c2", "pdw-c3"], + "exec_vnode": [ + "pdw-c1-v00", + "pdw-c1-v01", + "pdw-c1-v02", + "pdw-c1-v03", + "pdw-c1-v04", + "pdw-c1-v04-s0", + "pdw-c1-v04-s1", + "pdw-c2-v00", + "pdw-c2-v01", + "pdw-c2-v02", + "pdw-c2-v03", + 
"pdw-c2-v04", + "pdw-c2-v04-s0", + "pdw-c2-v04-s1", + "pdw-c3-v00", + "pdw-c3-v01", + "pdw-c3-v02", + "pdw-c3-v03", + "pdw-c3-v04", + "pdw-c3-v04-s0", + "pdw-c3-v04-s1", + ], + "raw_exec_vnode": "(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=2+pdw-c1-v02:ncpus=1+pdw-c1-v03:ncpus=1+pdw-c1-v04:ncpus=1" + "+pdw-c1-v04-s0:ncpus=1+pdw-c1-v04-s1:ncpus=1)+(" + "pdw-c2-v00:ncpus=4+pdw-c2-v01:ncpus=2+pdw-c2-v02:ncpus=1+pdw-c2-v03:ncpus=1+pdw-c2-v04:ncpus=1" + "+pdw-c2-v04-s0:ncpus=1+pdw-c2-v04-s1:ncpus=1)+(" + "pdw-c3-v00:ncpus=4+pdw-c3-v01:ncpus=2+pdw-c3-v02:ncpus=1+pdw-c3-v03:ncpus=1+pdw-c3-v04:ncpus=1" + "+pdw-c3-v04-s0:ncpus=1+pdw-c3-v04-s1:ncpus=1)", + "ctime": datetime(2021, 4, 16, 18, 59, 22), + "qtime": datetime(2021, 4, 16, 18, 59, 22), + "etime": datetime(2021, 4, 16, 18, 59, 22), + "start": datetime(2021, 4, 16, 18, 59, 22), + "end": datetime(2021, 4, 16, 18, 59, 34), + "Exit_status": "0", + "resources_used.walltime": 10, + "run_count": 1, + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 3, + "ncpus": "11", + "obj_type": "gpu", + }, + ], + "session": "2063", + }, + True, + ), + ( + "Job start: 3 exec_hosts, 9 exec_vnodes ( qsub -l select=3:ncpus=5:obj_type=gpu -- /bin/sleep 10)", + "04/18/2021 19:24:23;S;119.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=STDIN queue=somequeue32 " + "ctime=1618773863 qtime=1618773863 etime=1618773863 start=1618773863 exec_host=pdw-c1/0*0+pdw-c1/1*0+pdw-c2/0*0 " + "exec_vnode=(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=1)+(" + "pdw-c1-v01:ncpus=1+pdw-c1-v02:ncpus=1+pdw-c1-v03:ncpus=1+pdw-c1-v04:ncpus=1+pdw-c1-v04-s0:ncpus=1)+(" + "pdw-c2-v00:ncpus=4+pdw-c2-v01:ncpus=1) Resource_List.ncpus=15 Resource_List.nodect=3 Resource_List.place=free " + "Resource_List.select=3:ncpus=5:obj_type=gpu resource_assigned.ncpus=16", + { + "action": "start", + "record": "S", + "event_type": "JOB", + "identifier": "119.pdw-s1", + "scheduler_timestamp": datetime(2021, 
4, 18, 19, 24, 23), + "sub_identifier": "0", + "user": "someuser73", + "raw_user_name": "someuser73", + "project": "someuser72", + "raw_project_name": "someuser72", + "queue": "somequeue32", + "jobname": "STDIN", + "group": "someuser73", + "Resource_List.nodect": 3, + "Resource_List.ncpus": 15, + "exec_host": ["pdw-c1", "pdw-c1", "pdw-c2"], + "exec_vnode": [ + "pdw-c1-v00", + "pdw-c1-v01", + "pdw-c1-v01", + "pdw-c1-v02", + "pdw-c1-v03", + "pdw-c1-v04", + "pdw-c1-v04-s0", + "pdw-c2-v00", + "pdw-c2-v01", + ], + "raw_exec_vnode": "(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=1)+(" + "pdw-c1-v01:ncpus=1+pdw-c1-v02:ncpus=1+pdw-c1-v03:ncpus=1+pdw-c1-v04:ncpus=1+pdw-c1-v04-s0" + ":ncpus=1)+(pdw-c2-v00:ncpus=4+pdw-c2-v01:ncpus=1)", + "ctime": datetime(2021, 4, 18, 19, 24, 23), + "qtime": datetime(2021, 4, 18, 19, 24, 23), + "etime": datetime(2021, 4, 18, 19, 24, 23), + "start": datetime(2021, 4, 18, 19, 24, 23), + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 3, + "ncpus": "5", + "obj_type": "gpu", + }, + ], + }, + True, + ), + ( + "Job end: 3 exec_hosts, 9 exec_vnodes ( qsub -l select=3:ncpus=5:obj_type=gpu -- /bin/sleep 10)", + "04/18/2021 19:24:34;E;119.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=STDIN queue=somequeue32 " + "ctime=1618773863 qtime=1618773863 etime=1618773863 start=1618773863 exec_host=pdw-c1/0*0+pdw-c1/1*0+pdw-c2/0*0 " + "exec_vnode=(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=1)+(" + "pdw-c1-v01:ncpus=1+pdw-c1-v02:ncpus=1+pdw-c1-v03:ncpus=1+pdw-c1-v04:ncpus=1+pdw-c1-v04-s0:ncpus=1)+(" + "pdw-c2-v00:ncpus=4+pdw-c2-v01:ncpus=1) Resource_List.ncpus=15 Resource_List.nodect=3 Resource_List.place=free " + "Resource_List.select=3:ncpus=5:obj_type=gpu session=2072 end=1618773874 Exit_status=0 resources_used.cpupercent=0 " + "resources_used.cput=00:00:00 resources_used.mem=868kb resources_used.ncpus=15 resources_used.vmem=12916kb " + "resources_used.walltime=00:00:10 
run_count=1", + { + "action": "end", + "record": "E", + "event_type": "JOB", + "identifier": "119.pdw-s1", + "scheduler_timestamp": datetime(2021, 4, 18, 19, 24, 34), + "sub_identifier": "0", + "user": "someuser73", + "raw_user_name": "someuser73", + "project": "someuser72", + "raw_project_name": "someuser72", + "queue": "somequeue32", + "jobname": "STDIN", + "group": "someuser73", + "Resource_List.nodect": 3, + "Resource_List.ncpus": 15, + "exec_host": ["pdw-c1", "pdw-c1", "pdw-c2"], + "exec_vnode": [ + "pdw-c1-v00", + "pdw-c1-v01", + "pdw-c1-v01", + "pdw-c1-v02", + "pdw-c1-v03", + "pdw-c1-v04", + "pdw-c1-v04-s0", + "pdw-c2-v00", + "pdw-c2-v01", + ], + "raw_exec_vnode": "(pdw-c1-v00:ncpus=4+pdw-c1-v01:ncpus=1)+(" + "pdw-c1-v01:ncpus=1+pdw-c1-v02:ncpus=1+pdw-c1-v03:ncpus=1+pdw-c1-v04:ncpus=1+pdw-c1-v04-s0" + ":ncpus=1)+(pdw-c2-v00:ncpus=4+pdw-c2-v01:ncpus=1)", + "ctime": datetime(2021, 4, 18, 19, 24, 23), + "qtime": datetime(2021, 4, 18, 19, 24, 23), + "etime": datetime(2021, 4, 18, 19, 24, 23), + "start": datetime(2021, 4, 18, 19, 24, 23), + "end": datetime(2021, 4, 18, 19, 24, 34), + "Exit_status": "0", + "resources_used.walltime": 10, + "run_count": 1, + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 3, + "ncpus": "5", + "obj_type": "gpu", + }, + ], + "session": "2072", + }, + True, + ), + ( + "13 Job start: 3 exec_hosts, 3 exec_vnodes (qsub -l select=3:ncpus=5 -- /bin/sleep 10)", + "04/21/2021 14:16:41;S;50.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=STDIN queue=somequeue32 " + "ctime=1619014601 qtime=1619014601 etime=1619014601 start=1619014601 exec_host=pdw-c1/0*5+pdw-c2/0*5+pdw-c3/0*5 " + "exec_vnode=(pdw-c1:ncpus=5)+(pdw-c2:ncpus=5)+(pdw-c3:ncpus=5) Resource_List.ncpus=15 Resource_List.nodect=3 " + "Resource_List.place=free Resource_List.select=3:ncpus=5 resource_assigned.mem=6106404kb resource_assigned.ncpus=24", + { + "action": "start", + "record": 
"S", + "event_type": "JOB", + "identifier": "50.pdw-s1", + "scheduler_timestamp": datetime(2021, 4, 21, 14, 16, 41), + "sub_identifier": "0", + "user": "someuser73", + "raw_user_name": "someuser73", + "project": "someuser72", + "raw_project_name": "someuser72", + "queue": "somequeue32", + "jobname": "STDIN", + "group": "someuser73", + "Resource_List.nodect": 3, + "Resource_List.ncpus": 15, + "exec_vnode": ["pdw-c1", "pdw-c2", "pdw-c3"], + "raw_exec_vnode": "(pdw-c1:ncpus=5)+(pdw-c2:ncpus=5)+(pdw-c3:ncpus=5)", + # exec_host=pdw-c1/0*5+pdw-c2/0*5+pdw-c3/0*5 + "exec_host": ["pdw-c1", "pdw-c2", "pdw-c3"], + "ctime": datetime(2021, 4, 21, 14, 16, 41), + "qtime": datetime(2021, 4, 21, 14, 16, 41), + "etime": datetime(2021, 4, 21, 14, 16, 41), + "start": datetime(2021, 4, 21, 14, 16, 41), + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 3, + "ncpus": "5", + }, + ], + }, + True, + ), + ( + "Job end: 3 exec_hosts, 3 exec_vnodes (qsub -l select=3:ncpus=5 -- /bin/sleep 10)", + "04/21/2021 14:16:52;E;50.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=STDIN queue=somequeue32 " + "ctime=1619014601 qtime=1619014601 etime=1619014601 start=1619014601 exec_host=pdw-c1/0*5+pdw-c2/0*5+pdw-c3/0*5 " + "exec_vnode=(pdw-c1:ncpus=5)+(pdw-c2:ncpus=5)+(pdw-c3:ncpus=5) Resource_List.ncpus=15 Resource_List.nodect=3 " + "Resource_List.place=free Resource_List.select=3:ncpus=5 session=1144 end=1619014612 Exit_status=0 " + "resources_used.cpupercent=0 resources_used.cput=00:00:00 resources_used.mem=848kb resources_used.ncpus=15 " + "resources_used.vmem=12916kb resources_used.walltime=00:00:10 run_count=1", + { + "action": "end", + "record": "E", + "event_type": "JOB", + "identifier": "50.pdw-s1", + "scheduler_timestamp": datetime(2021, 4, 21, 14, 16, 52), + "sub_identifier": "0", + "user": "someuser73", + "raw_user_name": "someuser73", + "project": "someuser72", + "raw_project_name": "someuser72", 
+ "queue": "somequeue32", + "jobname": "STDIN", + "group": "someuser73", + "Resource_List.nodect": 3, + "Resource_List.ncpus": 15, + "exec_vnode": ["pdw-c1", "pdw-c2", "pdw-c3"], + "raw_exec_vnode": "(pdw-c1:ncpus=5)+(pdw-c2:ncpus=5)+(pdw-c3:ncpus=5)", + # exec_host=pdw-c1/0*5+pdw-c2/0*5+pdw-c3/0*5 + "exec_host": ["pdw-c1", "pdw-c2", "pdw-c3"], + "ctime": datetime(2021, 4, 21, 14, 16, 41), + "qtime": datetime(2021, 4, 21, 14, 16, 41), + "etime": datetime(2021, 4, 21, 14, 16, 41), + "start": datetime(2021, 4, 21, 14, 16, 41), + "end": datetime(2021, 4, 21, 14, 16, 52), + "Exit_status": "0", + "resources_used.walltime": 10, + "run_count": 1, + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 3, + "ncpus": "5", + }, + ], + "session": "1144", + }, + True, + ), + ( + "Job end: 2 exec_hosts, 2 exec_vnodes (jobname=pbs_job_script.sh)", + '04/21/2021 18:46:27;E;0.pdw-s1;user=someuser73 group=someuser73 account="account1" project=project1 ' + "jobname=pbs_job_script.sh queue=somequeue32 ctime=1619030783 qtime=1619030783 etime=1619030783 start=1619030783 " + "exec_host=pdw-c1/0*2+pdw-c1/1*2 exec_vnode=(pdw-c1:ncpus=2)+(pdw-c1:ncpus=2) Resource_List.ncpus=4 " + "Resource_List.nodect=2 Resource_List.place=free Resource_List.select=2:ncpus=2 Resource_List.walltime=00:05:00 " + "session=561 end=1619030787 Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 " + "resources_used.mem=0kb resources_used.ncpus=4 resources_used.vmem=0kb resources_used.walltime=00:00:03 run_count=1", + { + "Exit_status": "0", + "Resource_List.ncpus": 4, + "Resource_List.nodect": 2, + "Resource_List.walltime": 300, + "action": "end", + "ctime": datetime(2021, 4, 21, 18, 46, 23), + "end": datetime(2021, 4, 21, 18, 46, 27), + "etime": datetime(2021, 4, 21, 18, 46, 23), + "event_type": "JOB", + # exec_host=pdw-c1/0*2+pdw-c1/1*2 + "exec_host": ["pdw-c1", "pdw-c1"], + "raw_exec_vnode": 
"(pdw-c1:ncpus=2)+(pdw-c1:ncpus=2)", + "exec_vnode": ["pdw-c1", "pdw-c1"], + "group": "someuser73", + "identifier": "0.pdw-s1", + "jobname": "pbs_job_script.sh", + "qtime": datetime(2021, 4, 21, 18, 46, 23), + "queue": "somequeue32", + "project": "project1", + "raw_project_name": "project1", + "account": "account1", + "raw_account_name": "account1", + "raw_user_name": "someuser73", + "record": "E", + "resources_used.walltime": 3, + "scheduler_timestamp": datetime(2021, 4, 21, 18, 46, 27), + "start": datetime(2021, 4, 21, 18, 46, 23), + "sub_identifier": "0", + "user": "someuser73", + "run_count": 1, + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 2, + "ncpus": "2", + }, + ], + "session": "561", + }, + True, + ), + ( + "Job start: 3 exec_hosts, 3 exec_vnodes (jobname=pbs_job_script.sh)", + "04/21/2021 18:48:25;S;7.pdw-s1;user=someuser73 group=someuser73 project=someuser72 jobname=pbs_job_script.sh " + "queue=somequeue32 ctime=1619030785 qtime=1619030785 etime=1619030785 start=1619030905 " + "exec_host=pdw-c3/0*3+pdw-c3/1*3+pdw-c4/0*3 exec_vnode=(pdw-c3:ncpus=3:mem=1kb)+(pdw-c3:ncpus=3:mem=1kb)+(" + "pdw-c4:ncpus=3:mem=1kb) Resource_List.mem=9b Resource_List.ncpus=9 Resource_List.nodect=3 Resource_List.place=free " + "Resource_List.select=3:ncpus=3:mem=3 Resource_List.walltime=00:05:00 resource_assigned.mem=4070936kb " + "resource_assigned.ncpus=16", + { + "Resource_List.ncpus": 9, + "Resource_List.nodect": 3, + "Resource_List.walltime": 300, + "action": "start", + "ctime": datetime(2021, 4, 21, 18, 46, 25), + "etime": datetime(2021, 4, 21, 18, 46, 25), + "event_type": "JOB", + # exec_host=pdw-c3/0*3+pdw-c3/1*3+pdw-c4/0*3 + "exec_host": ["pdw-c3", "pdw-c3", "pdw-c4"], + "exec_vnode": ["pdw-c3", "pdw-c3", "pdw-c4"], + "raw_exec_vnode": "(pdw-c3:ncpus=3:mem=1kb)+(pdw-c3:ncpus=3:mem=1kb)+(pdw-c4:ncpus=3:mem=1kb)", + "group": "someuser73", + "identifier": "7.pdw-s1", + "jobname": 
"pbs_job_script.sh", + "qtime": datetime(2021, 4, 21, 18, 46, 25), + "queue": "somequeue32", + "project": "someuser72", + "raw_project_name": "someuser72", + "raw_user_name": "someuser73", + "record": "S", + "scheduler_timestamp": datetime(2021, 4, 21, 18, 48, 25), + "start": datetime(2021, 4, 21, 18, 48, 25), + "sub_identifier": "0", + "user": "someuser73", + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 3, + "mem": "3", + "ncpus": "3", + }, + ], + }, + True, + ), + ( + "Job end: 2 exec_hosts, 2 exec_vnodes (jobname=pbs_job_script_16.sh)", + '04/22/2021 16:16:04;E;1.pdw-s1;user=someuser73 group=someuser73 account="account2" project=project2 ' + "jobname=pbs_job_script_16.sh queue=somequeue32 ctime=1619108159 qtime=1619108159 etime=1619108159 start=1619108159 " + "exec_host=pdw-c2/0*8+pdw-c3/0*8 exec_vnode=(pdw-c2:ncpus=8)+(pdw-c3:ncpus=8) Resource_List.ncpus=16 " + "Resource_List.nodect=2 Resource_List.place=free Resource_List.select=2:ncpus=8 Resource_List.walltime=00:05:00 " + "session=562 end=1619108164 Exit_status=0 resources_used.cpupercent=0 resources_used.cput=00:00:00 " + "resources_used.mem=820kb resources_used.ncpus=16 resources_used.vmem=12916kb resources_used.walltime=00:00:03 " + "run_count=1", + { + "Exit_status": "0", + "Resource_List.ncpus": 16, + "Resource_List.nodect": 2, + "Resource_List.walltime": 300, + "action": "end", + "ctime": datetime(2021, 4, 22, 16, 15, 59), + "end": datetime(2021, 4, 22, 16, 16, 4), + "etime": datetime(2021, 4, 22, 16, 15, 59), + "event_type": "JOB", + # exec_host=pdw-c2/0*8+pdw-c3/0*8 + "exec_host": ["pdw-c2", "pdw-c3"], + "raw_exec_vnode": "(pdw-c2:ncpus=8)+(pdw-c3:ncpus=8)", + "exec_vnode": ["pdw-c2", "pdw-c3"], + "group": "someuser73", + "identifier": "1.pdw-s1", + "jobname": "pbs_job_script_16.sh", + "qtime": datetime(2021, 4, 22, 16, 15, 59), + "queue": "somequeue32", + "account": "account2", + "raw_account_name": "account2", + 
"project": "project2", + "raw_project_name": "project2", + "raw_user_name": "someuser73", + "record": "E", + "resources_used.walltime": 3, + "scheduler_timestamp": datetime(2021, 4, 22, 16, 16, 4), + "start": datetime(2021, 4, 22, 16, 15, 59), + "sub_identifier": "0", + "user": "someuser73", + "run_count": 1, + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 2, + "ncpus": "8", + }, + ], + "session": "562", + }, + True, + ), + ( + "18 Job start: (jobname=N/A mode=script)", + "02/14/2020 23:59:33;S;409766;Resource_List.ncpus=4 Resource_List.nodect=4 Resource_List.walltime=0:30:00 " + "account=funny args= ctime=1581722605.06 cwd=/fs/somedirectoryobj826 etime=1581722605.06 " + 'exe=/fs/somedirectoryobj825 exec_host="3824-3826,3829" group=unknown jobname=N/A ' + "mode=script qtime=1581722605.06 queue=somequeue79 resource=somemachine3 session=unknown start=1581724773.64 " + "user=someuser234", + { + "Resource_List.ncpus": 4, + "Resource_List.nodect": 4, + "Resource_List.walltime": 1800, + "action": "start", + "ctime": datetime(2020, 2, 14, 23, 23, 25), + "cwd": "/fs/somedirectoryobj826", + "etime": datetime(2020, 2, 14, 23, 23, 25), + "event_type": "JOB", + "exe": "/fs/somedirectoryobj825", + # exec_host="3824-3826,3829" + "exec_host": ["3824-3826,3829"], + "group": "unknown", + "identifier": "409766", + "jobname": "N/A", + "mode": "script", + "qtime": datetime(2020, 2, 14, 23, 23, 25), + "queue": "somequeue79", + "account": "funny", + "raw_account_name": "funny", + "user": "someuser234", + "raw_user_name": "someuser234", + "record": "S", + "resource": "somemachine3", + "scheduler_timestamp": datetime(2020, 2, 14, 23, 59, 33), + "start": datetime(2020, 2, 14, 23, 59, 33), + "sub_identifier": "0", + "args": "", + "session": "unknown", + }, + True, + ), + ( + "19 suspend", + "11/29/2022 19:56:18;z;54095.somepbshost;resources_used.cpupercent=0 resources_used.cput=00:00:00 " + 
"resources_used.mem=0b resources_used.ncpus=1 resources_used.vmem=0kb resources_used.walltime=00:00:00", + { + "action": "suspend", + "event_type": "JOB", + "identifier": "54095.somepbshost", + "record": "z", + "resources_used.cpupercent": "0", + "resources_used.cput": "00:00:00", + "resources_used.mem": "0b", + "resources_used.ncpus": "1", + "resources_used.vmem": "0kb", + "resources_used.walltime": 0, + "scheduler_timestamp": datetime(2022, 11, 29, 19, 56, 18), + "sub_identifier": "0", + }, + True, + ), + ( + "20 resume", + "11/29/2022 19:56:34;r;54095.somepbshost;", + { + "action": "resume", + "event_type": "JOB", + "identifier": "54095.somepbshost", + "record": "r", + "scheduler_timestamp": datetime(2022, 11, 29, 19, 56, 34), + "sub_identifier": "0", + }, + True, + ), + ( + "21 log lines with different nodes used than requested", + "01/17/2013 21:01:10;Q;50888;queue=somequeue78", + { + "event_type": "JOB", + "sub_identifier": "0", + "action": "queue", + "identifier": "50888", + "queue": "somequeue78", + "record": "Q", + "scheduler_timestamp": datetime(2013, 1, 17, 21, 1, 10), + }, + True, + ), + ( + "22 Fix for PBS_Utils.pbs_accounting.BadLineError A", + "10/26/2021 16:24:55;D;52.pdw-s1;requestor=someuser73@pdw-s1.pdw.local", + { + "action": "delete", + "record": "D", + "event_type": "JOB", + "identifier": "52.pdw-s1", + "scheduler_timestamp": datetime(2021, 10, 26, 16, 24, 55), + "sub_identifier": "0", + "requester": "someuser73@pdw-s1.pdw.local", + }, + True, + ), + ( + "23 Fix for PBS_Utils.pbs_accounting.BadLineError B", + "10/26/2021 16:24:55;A;53.pdw-s1;Job deleted as result of dependency on job 52.pdw-s1", + { + "action": "abort", + "record": "A", + "event_type": "JOB", + "identifier": "53.pdw-s1", + "scheduler_timestamp": datetime(2021, 10, 26, 16, 24, 55), + "sub_identifier": "0", + }, + True, + ), + ( + "24 job array end", + '05/10/2023 17:33:35;E;5[].pdw-s1;user=someuser73 group=someuser73 account="account3" project=Project3 jobname=name3 ' + 
"queue=somequeue32 ctime=1683740001 qtime=1683740001 etime=1683740001 start=1683740001 Resource_List.ncpus=6 " + "Resource_List.nodect=2 Resource_List.place=free Resource_List.select=2:ncpus=3 Resource_List.walltime=00:05:00 " + "session=0 end=1683740015 Exit_status=0 run_count=0", + { + "Exit_status": "0", + "Resource_List.ncpus": 6, + "Resource_List.nodect": 2, + "Resource_List.walltime": 300, + "action": "end", + "ctime": datetime(2023, 5, 10, 17, 33, 21), + "end": datetime(2023, 5, 10, 17, 33, 35), + "etime": datetime(2023, 5, 10, 17, 33, 21), + "event_type": "JOB", + "group": "someuser73", + "identifier": "5[].pdw-s1", + "jobname": "name3", + "qtime": datetime(2023, 5, 10, 17, 33, 21), + "queue": "somequeue32", + "account": "account3", + "raw_account_name": "account3", + "project": "Project3", + "raw_project_name": "Project3", + "raw_user_name": "someuser73", + "record": "E", + "run_count": 0, + "scheduler_timestamp": datetime(2023, 5, 10, 17, 33, 35), + "start": datetime(2023, 5, 10, 17, 33, 21), + "sub_identifier": "0", + "user": "someuser73", + "Resource_List.place": { + "arrangement": "free", + "groups": [], + "sharing": None, + }, + "Resource_List.select": [ + { + "N": 2, + "ncpus": "3", + }, + ], + "session": "0", + }, + True, + ), + ( + "25 job with one quote", + """10/24/2023 17:31:14;S;674670;Resource_List.ncpus=2 Resource_List.nodect=2 Resource_List.walltime=1:00:00 + account=account831 args= ctime=1698168671.61 cwd=/fs/user827/MA etime=1698168671.61 + exe=/fs/somedirectoryobj827 exec_host="3828,3837" group=unknown jobname=somemachine3_debug_1_core.txt' mode=script + qtime=1698168671.61 queue=somequeue1 resource=somemachine3 session=unknown start=1698168674.61 user=user827""", + { + "Resource_List.ncpus": 2, + "Resource_List.nodect": 2, + "Resource_List.walltime": 3600, + "account": "account831", + "action": "start", + "ctime": datetime(2023, 10, 24, 17, 31, 11), + "cwd": "/fs/user827/MA", + "etime": datetime(2023, 10, 24, 17, 31, 11), + 
"event_type": "JOB", + "exe": "/fs/somedirectoryobj827", + "exec_host": ["3828,3837"], + "group": "unknown", + "identifier": "674670", + "jobname": "somemachine3_debug_1_core.txt'", + "mode": "script", + "qtime": datetime(2023, 10, 24, 17, 31, 11), + "queue": "somequeue1", + "raw_account_name": "account831", + "raw_user_name": "user827", + "record": "S", + "resource": "somemachine3", + "scheduler_timestamp": datetime(2023, 10, 24, 17, 31, 14), + "start": datetime(2023, 10, 24, 17, 31, 14), + "sub_identifier": "0", + "user": "user827", + "args": "", + "session": "unknown", + }, + True, + ), + ( + "26 job with one quote", + """10/24/2023 18:10:32;E;674670;Exit_status=0 Resource_List.ncpus=2 Resource_List.nodect=2 + Resource_List.walltime=1:00:00 account=account831 args= ctime=1698168671.61 cwd=/fs/user827/MA + end=1698171032.04 etime=1698168671.61 exe=/fs/somedirectoryobj827 exec_host="3828,3837" group=unknown + jobname=somemachine3_debug_1_core.txt' mode=script qtime=1698168671.61 queue=somequeue1 resource=somemachine3 + resources_used.location="3828,3837" resources_used.nodect=2 resources_used.walltime=0:39:17 session=unknown + start=1698168674.61 user=user827""", + { + "Exit_status": "0", + "Resource_List.ncpus": 2, + "Resource_List.nodect": 2, + "Resource_List.walltime": 3600, + "account": "account831", + "action": "end", + "ctime": datetime(2023, 10, 24, 17, 31, 11), + "cwd": "/fs/user827/MA", + "end": datetime(2023, 10, 24, 18, 10, 32), + "etime": datetime(2023, 10, 24, 17, 31, 11), + "event_type": "JOB", + "exe": "/fs/somedirectoryobj827", + "exec_host": ["3828,3837"], + "group": "unknown", + "identifier": "674670", + "jobname": "somemachine3_debug_1_core.txt'", + "mode": "script", + "qtime": datetime(2023, 10, 24, 17, 31, 11), + "queue": "somequeue1", + "raw_account_name": "account831", + "raw_user_name": "user827", + "record": "E", + "resource": "somemachine3", + "resources_used.nodect": 2, + "resources_used.walltime": 2357, + "scheduler_timestamp": 
datetime(2023, 10, 24, 18, 10, 32), + "start": datetime(2023, 10, 24, 17, 31, 14), + "sub_identifier": "0", + "user": "user827", + "args": "", + "session": "unknown", + }, + True, + ), + ( + "27 walltime test", + """10/26/2020 22:10:50;E;475302;Exit_status=2 Resource_List.ncpus=128 Resource_List.nodect=128 + Resource_List.walltime=00:10:00""", + { + "Exit_status": "2", + "Resource_List.ncpus": 128, + "Resource_List.nodect": 128, + "Resource_List.walltime": 600, + "action": "end", + "event_type": "JOB", + "identifier": "475302", + "record": "E", + "scheduler_timestamp": datetime(2020, 10, 26, 22, 10, 50), + "sub_identifier": "0", + }, + True, + ), + ( + "28 this is an invalid key, which is part of walltime." + "Resource_List.walltime=00:10:00 Resource_List.select:2:ncpus=128:mpiprocs=128", + """10/26/2023 08:41:14;Q;1702.imgt1;user=user968 group=somegroup21 account="someproject23" project=someproject23 + jobname=hello_world queue=somequeue65 ctime=1698327674 qtime=1698327674 etime=0 + Resource_List.allow_account_check_failure=True Resource_List.allow_negative_allocation=True + Resource_List.award_category=someword37 Resource_List.award_type=someword37 Resource_List.burn_ratio=0 + Resource_List.current_allocation=460795510784 Resource_List.ncpus=128 Resource_List.ni_resource=improv + Resource_List.nodect=1 Resource_List.overburn=False Resource_List.place=pack Resource_List.project_priority=2 + Resource_List.route_backfill=False Resource_List.select=1:ncpus=128 Resource_List.total_allocation=460800000000 + Resource_List.walltime=00:10:00 Resource_List.select:2:ncpus=128:mpiprocs=128""", + Exception, + False, + ), + ( + "malformed command line leakage: Resources_List.-I=", + """02/23/2024 22:33:55;Q;621652.somehost;user=someuser92 group=somegroup20 + account="someproject" project=someproject jobname=hello_world.sh queue=somequeue ctime=1708727635 qtime=1708727635 + etime=0 Resource_List.allow_account_check_failure=True Resource_List.allow_negative_allocation=True + 
Resource_List.award_category=someword34 Resource_List.award_type=someword234 Resource_List.burn_ratio=0 + Resource_List.current_allocation=-280471872 Resource_List.hbm_mode=flat + Resource_List.hpcm_image=compute_test_20240120T003017_0051f5d Resource_List.ncpus=208 Resource_List.ni_resource=aurora + Resource_List.nodect=1 Resource_List.overburn=False Resource_List.place=scatter Resource_List.project_priority=2 + Resource_List.route_backfill=True Resource_List.select=1 Resource_List.snc_mode=quad Resource_List.total_allocation=0 + Resource_List.walltime=00:30:00 Resource_List.-I=""", + { + "Resource_List.ncpus": 208, + "Resource_List.nodect": 1, + "Resource_List.place": {"arrangement": "scatter", "groups": [], "sharing": None}, + "Resource_List.select": [{"N": 1}], + "Resource_List.walltime": 1800, + "account": "someproject", + "action": "queue", + "ctime": datetime(2024, 2, 23, 22, 33, 55), + "event_type": "JOB", + "group": "somegroup20", + "identifier": "621652.somehost", + "jobname": "hello_world.sh", + "project": "someproject", + "qtime": datetime(2024, 2, 23, 22, 33, 55), + "queue": "somequeue", + "raw_account_name": "someproject", + "raw_project_name": "someproject", + "raw_user_name": "someuser92", + "record": "Q", + "scheduler_timestamp": datetime(2024, 2, 23, 22, 33, 55), + "sub_identifier": "0", + "user": "someuser92", + }, + True, + ), + ( + "'-' in Resource name: Resources_List.foo-bar", + """02/23/2024 22:33:55;Q;621652.somehost;user=someuser92 group=somegroup21 + account="someproject" project=someproject jobname=hello_world.sh queue=somequeue ctime=1708727635 qtime=1708727635 + etime=0 Resource_List.allow_account_check_failure=True Resource_List.allow_negative_allocation=True + Resource_List.award_category=someword34 Resource_List.award_type=someword234 Resource_List.burn_ratio=0 + Resource_List.current_allocation=-280471872 Resource_List.hbm_mode=flat + Resource_List.hpcm_image=compute_test_20240120T003017_0051f5d Resource_List.ncpus=208 
Resource_List.ni_resource=aurora + Resource_List.nodect=1 Resource_List.overburn=False Resource_List.place=scatter Resource_List.project_priority=2 + Resource_List.route_backfill=True Resource_List.select=1 Resource_List.snc_mode=quad Resource_List.total_allocation=0 + Resource_List.walltime=00:30:00 Resource_List.foo-bar=baz""", + { + "Resource_List.ncpus": 208, + "Resource_List.nodect": 1, + "Resource_List.place": {"arrangement": "scatter", "groups": [], "sharing": None}, + "Resource_List.select": [{"N": 1}], + "Resource_List.walltime": 1800, + "account": "someproject", + "action": "queue", + "ctime": datetime(2024, 2, 23, 22, 33, 55), + "event_type": "JOB", + "group": "somegroup21", + "identifier": "621652.somehost", + "jobname": "hello_world.sh", + "project": "someproject", + "qtime": datetime(2024, 2, 23, 22, 33, 55), + "queue": "somequeue", + "raw_account_name": "someproject", + "raw_project_name": "someproject", + "raw_user_name": "someuser92", + "record": "Q", + "scheduler_timestamp": datetime(2024, 2, 23, 22, 33, 55), + "sub_identifier": "0", + "user": "someuser92", + }, + True, + ), + ( + "malformed command line leakage: Resource_List.selec816:ncpus=208", + """02/22/2024 04:30:47;Q;620255.somehost;user=someuser45 group=somegroup21 + account="someproject" project=someproject jobname=STDIN queue=somequeue ctime=1708576247 qtime=1708576247 etime=0 + Resource_List.allow_account_check_failure=True Resource_List.allow_negative_allocation=True + Resource_List.award_category=someword34 Resource_List.award_type=someword34 Resource_List.burn_ratio=0 + Resource_List.current_allocation=-288779960320 Resource_List.hbm_mode=flat + Resource_List.hpcm_image=compute_test_20240120T003017_0051f5d Resource_List.ncpus=1 Resource_List.ni_resource=aurora + Resource_List.nodect=1 Resource_List.overburn=False Resource_List.place=pack Resource_List.project_priority=2 + Resource_List.route_backfill=False Resource_List.select=1:ncpus=1 Resource_List.snc_mode=quad + 
Resource_List.total_allocation=0 + Resource_List.walltime=04:00:00 Resource_List.selec816:ncpus=208""", + Exception, + False, + ), + ( + "line with resvname", + '01/21/2023 22:10:18;E;396595.somehost;user=someuser322 group=somegroup21 account="RL-fold" project=projecta ' + "jobname=submit.sh queue=someresv23 resvname=ResvName resvID=someresv23.somehost ctime=1674338973 " + "qtime=1674338974 etime=1674338974 start=1674338976 exec_host=x3006c0s13b1n0/0*64+x3006c0s19b0n0/0*64 exec_vnode=(" + "x3006c0s13b1n0:ncpus=64)+(x3006c0s19b0n0:ncpus=64) Resource_List.allow_account_check_failure=True " + "Resource_List.allow_negative_allocation=True Resource_List.award_category=someword34 " + "Resource_List.award_type=someword34 Resource_List.backfill_factor=84600 Resource_List.backfill_max=50 " + "Resource_List.base_score=0 Resource_List.burn_ratio=0.2028 Resource_List.current_allocation=4018003968 " + "Resource_List.eagle_fs=True Resource_List.enable_backfill=0 Resource_List.enable_fifo=0 Resource_List.enable_wfp=0 " + "Resource_List.fifo_factor=1800 Resource_List.filesystems=home:eagle Resource_List.home_fs=True " + "Resource_List.ncpus=128 Resource_List.ni_resource=somemachine4 Resource_List.nodect=2 Resource_List.overburn=False " + "Resource_List.place=scatter Resource_List.preempt_targets=NONE Resource_List.project_priority=2 " + "Resource_List.route_backfill=False Resource_List.score_boost=0 Resource_List.select=2:system=somemachine4 " + "Resource_List.total_allocation=5040000000 Resource_List.total_cpus=560 Resource_List.walltime=00:30:00 " + "Resource_List.wfp_factor=100000 session=0 end=1674339018 Exit_status=1 resources_used.cpupercent=0 " + "resources_used.cput=00:00:00 resources_used.mem=3880kb resources_used.ncpus=128 resources_used.vmem=16212kb " + "resources_used.walltime=00:00:01 eligible_time=00:00:08 run_count=1", + { + "Exit_status": "1", + "Resource_List.ncpus": 128, + "Resource_List.nodect": 2, + "Resource_List.place": {"arrangement": "scatter", "groups": 
[], "sharing": None}, + "Resource_List.select": [{"N": 2, "system": "somemachine4"}], + "Resource_List.walltime": 1800, + "account": "RL-fold", + "action": "end", + "ctime": datetime(2023, 1, 21, 22, 9, 33), + "end": datetime(2023, 1, 21, 22, 10, 18), + "etime": datetime(2023, 1, 21, 22, 9, 34), + "event_type": "JOB", + "exec_host": ["x3006c0s13b1n0", "x3006c0s19b0n0"], + "exec_vnode": ["x3006c0s13b1n0", "x3006c0s19b0n0"], + "group": "somegroup21", + "identifier": "396595.somehost", + "jobname": "submit.sh", + "project": "projecta", + "qtime": datetime(2023, 1, 21, 22, 9, 34), + "queue": "someresv23", + "raw_account_name": "RL-fold", + "raw_exec_vnode": "(x3006c0s13b1n0:ncpus=64)+(x3006c0s19b0n0:ncpus=64)", + "raw_project_name": "projecta", + "raw_user_name": "someuser322", + "record": "E", + "resources_used.walltime": 1, + "resvID": "someresv23.somehost", + "resvname": "ResvName", + "run_count": 1, + "scheduler_timestamp": datetime(2023, 1, 21, 22, 10, 18), + "session": "0", + "start": datetime(2023, 1, 21, 22, 9, 36), + "sub_identifier": "0", + "user": "someuser322", + }, + True, + ), + ( + "line with resvname", + "07/25/2023 23:54:37;E;562207.somemachine4-pbs-01.hsn.cm.somemachine4;user=someuser93 group=somegroup23 " + 'account="someproject2" project=someproject2 jobname=HACC_0512_C04_MPI04_GPU04_job queue=somequeue23 ctime=1690299801 ' + "qtime=1690299801 etime=1690299801 start=1690328787 exec_host=x3013c0s31b0n0/0*64 exec_vnode=(" + "x3013c0s31b0n0:ncpus=64) Resource_List.allow_account_check_failure=True Resource_List.allow_negative_allocation=True " + "Resource_List.award_category=someword34 Resource_List.award_type=someword34 " + "Resource_List.backfill_factor=84600 Resource_List.backfill_max=50 Resource_List.base_score=51 " + "Resource_List.burn_ratio=0.0809 Resource_List.current_allocation=9264516096 Resource_List.enable_backfill=0 " + "Resource_List.enable_fifo=1 Resource_List.enable_wfp=0 Resource_List.fifo_factor=1800 " + 
"Resource_List.filesystems=grand Resource_List.grand_fs=True Resource_List.ncpus=64 Resource_List.ni_resource=somemachine4 " + "Resource_List.nodect=1 Resource_List.overburn=False Resource_List.place=scatter Resource_List.preempt_targets=NONE " + "Resource_List.project_priority=20 Resource_List.route_backfill=False Resource_List.score_boost=0 " + "Resource_List.select=1 Resource_List.total_allocation=10080000000 Resource_List.total_cpus=560 " + "Resource_List.walltime=00:30:00 Resource_List.wfp_factor=100000 session=49469 end=1690329277 Exit_status=0 " + "resources_used.cpupercent=357 resources_used.cput=00:29:05 resources_used.mem=22536524kb resources_used.ncpus=64 " + "resources_used.vmem=98586472kb resources_used.walltime=00:08:03 eligible_time=08:03:08 run_count=1", + { + "Exit_status": "0", + "Resource_List.ncpus": 64, + "Resource_List.nodect": 1, + "Resource_List.place": {"arrangement": "scatter", "groups": [], "sharing": None}, + "Resource_List.select": [{"N": 1}], + "Resource_List.walltime": 1800, + "account": "someproject2", + "action": "end", + "ctime": datetime(2023, 7, 25, 15, 43, 21), + "end": datetime(2023, 7, 25, 23, 54, 37), + "etime": datetime(2023, 7, 25, 15, 43, 21), + "event_type": "JOB", + "exec_host": ["x3013c0s31b0n0"], + "exec_vnode": ["x3013c0s31b0n0"], + "group": "somegroup23", + "identifier": "562207.somemachine4-pbs-01.hsn.cm.somemachine4", + "jobname": "HACC_0512_C04_MPI04_GPU04_job", + "project": "someproject2", + "qtime": datetime(2023, 7, 25, 15, 43, 21), + "queue": "somequeue23", + "raw_account_name": "someproject2", + "raw_exec_vnode": "(x3013c0s31b0n0:ncpus=64)", + "raw_project_name": "someproject2", + "raw_user_name": "someuser93", + "record": "E", + "resources_used.walltime": 483, + "run_count": 1, + "scheduler_timestamp": datetime(2023, 7, 25, 23, 54, 37), + "session": "49469", + "start": datetime(2023, 7, 25, 23, 46, 27), + "sub_identifier": "0", + "user": "someuser93", + }, + True, + ), + ] + if ids: + deck = [f"{i:0>2}" for 
i, _ in enumerate(deck)] + return deck + + +@pytest.mark.parametrize("line,valid_lt_line", input_pbs_line_extract_time(), ids=input_pbs_line_extract_time(ids=True)) +def test_parse_pbs_log_line_extract_time(line, valid_lt_line): + correct_logtime, correct_data = valid_lt_line + logtime, data = pbs_line_extract_time(line) + assert logtime == correct_logtime, f"RESULT:{logtime} != CORRECTRESULT:{correct_logtime}" + assert data == correct_data, f"RESULT:{logtime} != CORRECTRESULT:{correct_data}" + dct = parse_pbs_log_line(logtime, data) + assert dct["record"] in ["S", "D", "Q", "E", "A"] + assert dct["identifier"] is not None + result_line = create_pbs_log_line(logtime, dct["record"], dct["identifier"], dct) + new_logtime, new_data = pbs_line_extract_time(result_line) + new_dct = parse_pbs_log_line(new_logtime, new_data) + assert dct == new_dct + + +@pytest.mark.parametrize( + "comment,line,valid_record,valid_data", input_parse_pbs_log_line(), ids=input_parse_pbs_log_line(ids=True) +) +def test_parse_pbs_log_line(comment, line, valid_record, valid_data): + default_resource = "someresource" + logtime, data = pbs_line_extract_time(line) + if valid_record == BadLineError: + with pytest.raises(BadLineError): + parse_pbs_log_line(logtime, data, resource=default_resource) + elif valid_record == Exception: + with pytest.raises(Exception): + parse_pbs_log_line(logtime, data, resource=default_resource) + else: + result = parse_pbs_log_line(logtime, data, resource=default_resource) + pprint(result, width=132) + # if 'resource' in result and result['resource'] != default_resource: + # # this is the case where the resource doesn't equal the default given resource. + # pass + if " resource=" not in data: + # add it in the records that should have it now. 
+ valid_record["resource"] = default_resource + for key in result.keys(): + if key not in FieldHandlers.builtin_keys and key not in FieldHandlers.encode_passthrough_lst: + assert key in data + assert result == valid_record, pformat(result, width=132) + + +@pytest.mark.parametrize( + "comment,line,valid_record,valid_data", input_parse_pbs_log_line(), ids=input_parse_pbs_log_line(ids=True) +) +def test_parse_pbs_log_line_with_resource(comment, line, valid_record, valid_data): + logtime, data = pbs_line_extract_time(line) + if valid_record == BadLineError: + with pytest.raises(BadLineError): + parse_pbs_log_line(logtime, data) + elif valid_record == Exception: + with pytest.raises(Exception): + parse_pbs_log_line(logtime, data) + else: + result = parse_pbs_log_line(logtime, data) + pprint(result, width=132) + assert result == valid_record, pformat(result, width=132) + + +def check_kv_pair_records(src, dst): + # these need to be split but account for quotes. + assert parse_key_value_pairs(src) == parse_key_value_pairs(dst) + + +# fixme: add a bad data marker. +@pytest.mark.parametrize("comment,line,valid_result,valid_data", input_parse_kv_pairs(), ids=input_parse_kv_pairs(ids=True)) +def test_pbs_parse_key_value_pairs(comment, line, valid_result, valid_data): + logtime, data = pbs_line_extract_time(line) + record_type, identifier, record_keyvals = split_record(data) + if valid_result == BadLineError: + with pytest.raises(BadLineError): + parse_key_value_pairs(record_keyvals) + else: + result = parse_key_value_pairs(record_keyvals) + for key, value in valid_result.items(): + assert value == result[key], f"{key}: {value} != {result[key]}" + assert result == valid_result, f"{line=}\n{pformat(result)=}\n != \n {pformat(valid_result)=}" + if valid_data: # we lost and added data, we cannot regenerate. 
+ generated_record_keyvals = format_key_value_pairs(result) + check_kv_pair_records(record_keyvals, generated_record_keyvals) + + +def input_get_time(): + deck = [ + # format, line ts, valid + (LOG_REGEX.LOG_FORMAT_PBS, "03/24/2011 13:43:28", 1300974208.0), + (LOG_REGEX.LOG_FORMAT_PBS_US, "03/24/2011 13:43:28.012345", 1300974208.012345), + ] + return deck + + +@pytest.mark.parametrize("fmt,line_ts,valid", input_get_time()) +def test_get_time(fmt, line_ts, valid): + result = get_time(line_ts, fmt) + assert result == valid + + +def input_hms_to_seconds(): + deck = [ + ("0:29:00", 1740), + ("-1:52:00", 0), # fixme: why??? + ("03:58:00", 14280), + ("154:00:00", 554400), + ("154:00", 9240), + ("0:29", 29), + ("1:16:00", 4560), + ("0:01:45", 105), + ] + return deck + + +@pytest.mark.parametrize("walltime,correct_result", input_hms_to_seconds()) +def test_hms_to_seconds(walltime, correct_result): + result = hms_to_seconds(walltime) + assert result == correct_result, "RESULT:%s CORRECT RESULT:%s" % (result, correct_result) + result_seconds = hms_to_seconds(seconds_to_hms(result)) + assert result_seconds == result + + +def test_fix_newlines_extract_time_00(): + lines = [ + "05/02/2014 20:27:02;E;257928;Exit_status=0 Resource_List.ncpus=65536 Resource_List.nodect=4096 " + "Resource_List.walltime=0:29:00 account=SSSPPg args=-g,-t,256k,-i,20,-p,-c,-w,-b,256k,-a,MPIIO ctime=1399061907.45 " + "cwd=/fs/somedirectoryobj3 end=1399062422.19 etime=1399061907.45 " + "exe=/fs/somedirectoryobj2" + "-bridgeringagg-automated-nodebug exec_host=MIR-04C00-77FF1-4096 group=unknown jobname=N/A mode=c16 qtime=1399061907.45 " + "queue=somequeue09 resources_used.location=MIR-04C00-77FF1-4096 resources_used.nodect=4096 resources_used.walltime=0:04:30 " + "session=unknown start=1399062151.69 user=userg", + "05/02/2014 20:27:35;S;256107;Resource_List.ncpus=1024 Resource_List.nodect=1024 Resource_List.walltime=6:00:00 " + "account=someaccount2 args= ctime=1398890600.49 " + "cwd=/fs/somedirectoryobj " 
+ "etime=1398890600.49 exe=/fs/somedirectoryobj4" + ".0_ep_30.0/run_ensemble_job.292 exec_host=MIR-48800-7BB71-1-1024 group=unknown jobname=npt2-job292 mode=script " + "qtime=1398890600.49 queue=somequeue09 session=unknown start=1399062455.04 user=someuser93", + "05/02/2014 20:28:11;S;257828;Resource_List.ncpus=32768 Resource_List.nodect=2048 Resource_List.walltime=4:00:00 " + "account=someproject2 args=outFoamx_SecXY.input", + "ctime=1399056664.55 cwd=/fs/somedirectoryobjE etime=1399056664.55 " + "exe=/fs/somedirectoryobj6 exec_host=MIR-08000-3B3F1-2048 " + "group=unknown jobname=N/A mode=c16 qtime=1399056664.55 queue=somequeue09 session=unknown start=1399062491.92 user=useri", + "05/02/2014 20:28:43;S;257924;Resource_List.ncpus=1024 Resource_List.nodect=1024 Resource_List.walltime=0:40:00 " + "account=LiAirBattery args= ctime=1399061768.84 cwd=/fs/somedirectoryobjG " + "etime=1399061768.84 exe=/fs/somedirectoryobj5 " + "exec_host=MIR-48000-7B371-1-1024 group=unknown jobname=N/A mode=script qtime=1399061768.84 queue=somequeue09 " + "session=unknown start=1399062523.88 user=userj", + "05/02/2014 20:29:19;S;257931;Resource_List.ncpus=65536 Resource_List.nodect=2048 Resource_List.walltime=2:00:00 " + "account=Ray_Benard args= ctime=1399062080.19 cwd=/fs/somedirectoryobjF etime=1399062080.19 " + "exe=/fs/somedirectoryobj7 exec_host=MIR-48400-7B7F1-2048 group=unknown jobname=N/A mode=c32 " + "qtime=1399062080.19 queue=somequeue09 session=unknown start=1399062559.5 user=userk", + "05/02/2014 20:29:39;Q;257933;queue=somequeue64", + ] + parsed = [] + times_and_data = fix_newlines_extract_time(lines, version=2) + assert len(times_and_data) == len(lines) - 1 + for logtime, data in times_and_data: + parsed_line = parse_pbs_log_line(logtime, data) + parsed.append(parsed_line) + assert len(parsed) == len(lines) - 1 + + +def test_fix_newlines_extract_time_01(): + lines = [ + "05/02/2014 20:27:02;E;257928;Exit_status=0 Resource_List.ncpus=65536 Resource_List.nodect=4096 " + 
"Resource_List.walltime=0:29:00 account=SSSPPg args=-g,-t,256k,-i,20,-p,-c,-w,-b,256k,-a,MPIIO ctime=1399061907.45 " + "cwd=/fs/somedirectoryobj3 end=1399062422.19 etime=1399061907.45 " + "exe=/fs/somedirectoryobj2" + "-bridgeringagg-automated-nodebug exec_host=MIR-04C00-77FF1-4096 group=unknown jobname=N/A mode=c16 qtime=1399061907.45 " + "queue=somequeue09 resources_used.location=MIR-04C00-77FF1-4096 resources_used.nodect=4096 resources_used.walltime=0:04:30 " + "session=unknown start=1399062151.69 user=userg", + "05/02/2014 20:27:35;S;256107;Resource_List.ncpus=1024 Resource_List.nodect=1024 Resource_List.walltime=6:00:00 " + "account=someaccount2 args= ctime=1398890600.49 " + "cwd=/fs/somedirectoryobj " + "etime=1398890600.49 exe=/fs/somedirectoryobj4" + ".0_ep_30.0/run_ensemble_job.292 exec_host=MIR-48800-7BB71-1-1024 group=unknown jobname=npt2-job292 mode=script " + "qtime=1398890600.49 queue=somequeue09 session=unknown start=1399062455.04 user=someuser93", + "05/02/2014 20:28:11;S;257828;Resource_List.ncpus=32768 Resource_List.nodect=2048 Resource_List.walltime=4:00:00 " + "account=someproject2 args=outFoamx_SecXY.input", + "ctime=1399056664.55 cwd=/fs/somedirectoryobjE etime=1399056664.55 " + "exe=/fs/somedirectoryobj6 exec_host=MIR-08000-3B3F1-2048 " + "group=unknown jobname=N/A mode=c16 qtime=1399056664.55 queue=somequeue09 session=unknown start=1399062491.92 user=useri", + "05/02/2014 20:28:43;S;257924;Resource_List.ncpus=1024 Resource_List.nodect=1024 Resource_List.walltime=0:40:00 " + "account=LiAirBattery args= ctime=1399061768.84 cwd=/fs/somedirectoryobjG " + "etime=1399061768.84 exe=/fs/somedirectoryobj5 " + "exec_host=MIR-48000-7B371-1-1024 group=unknown jobname=N/A mode=script qtime=1399061768.84 queue=somequeue09 " + "session=unknown start=1399062523.88 user=userj", + "05/02/2014 20:29:19;S;257931;Resource_List.ncpus=65536 Resource_List.nodect=2048 Resource_List.walltime=2:00:00 " + "account=Ray_Benard args= ctime=1399062080.19 
cwd=/fs/somedirectoryobjF etime=1399062080.19 " + "exe=/fs/somedirectoryobj7 exec_host=MIR-48400-7B7F1-2048 group=unknown jobname=N/A mode=c32 " + "qtime=1399062080.19 queue=somequeue09 session=unknown start=1399062559.5 user=userk", + "05/02/2014 20:29:39;Q;257933;queue=somequeue64", + ] + correct_lineno = [1, 2, 3, 5, 6, 7] + parsed = [] + times_and_data = fix_newlines_extract_time(lines, include_lineno=True, version=2) + assert len(times_and_data) == len(lines) - 1 + for idx, (logtime, lineno, data) in enumerate(times_and_data): + parsed_line = parse_pbs_log_line(logtime, data) + parsed.append(parsed_line) + assert lineno == correct_lineno[idx] + assert len(parsed) == len(lines) - 1 + + +def test_fix_newlines_extract_time_02(): + """Fix for PBS_Utils.pbs_accounting.BadLineError""" + lines = [ + "10/26/2021 16:24:55;D;52.pdw-s1;requestor=someuser73@pdw-s1.pdw.local", + "10/26/2021 16:24:55;A;53.pdw-s1;Job deleted as result of dependency on job 52.pdw-s1", + ] + parsed = [] + times_and_data = fix_newlines_extract_time(lines) + for logtime, data in times_and_data: + parsed_line = parse_pbs_log_line(logtime, data) + parsed.append(parsed_line) + assert len(parsed) == len(lines) + + +def test_fix_newlines_extract_time_03(): + """an interesting a record.""" + lines = [ + r"07/17/2024 18:07:24;a;1624.somemachine;Variable_List=PBS_O_HOME=/fs/someuser," + "PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=someuser," + "PBS_O_PATH=/fs/somedirectoryobj12:/fs/somedirectoryobj13:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr," + "PBS_O_SHELL=/bin/bash,PBS_O_HOST=login-01,PBS_O_WORKDIR=/fs/somedirectoryobj11," + "PBS_O_SYSTEM=Linux,SHELL=/bin/bash,HISTCONTROL=ignoredups," + "LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;37;41:su=37" + ";41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31" + 
":*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31" + ":*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31" + ":*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31" + ":*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31" + ":*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01" + ";35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx" + "=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35" + ":*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35" + ":*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35" + ":*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.m4a=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36" + ":*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.oga=01;36:*.opus=01;36:*.spx=01;36:*.xspf=01;36:," + "LMOD_SETTARG_FULL_SUPPORT=no,CONDA_PROMPT_MODIFIER=(base),,LMOD_VERSION=8.7.32," + 'SSH_TTY=/dev/pts/15,BASH_FUNC_ml%%=() { eval "$($LMOD_DIR/ml_cmd "$@")",', + r"},BASH_FUNC_which%%=() { ( alias; eval ${which_declare} ) | /usr/bin/which --tty-only --read-alias --read-functions " + r"--show-tilde --show-dot $@", + r"},BASH_FUNC_module%%=() { if [ -z \"${LMOD_SH_DBG_ON+x}\" ]; then", + r" case \"$-\" in ", + r" *v*x*)", + r" __lmod_sh_dbg=\'vx\'", + r" ;; *v*)" "", + r" __lmod_sh_dbg=\'v\'", + r" ;; *x*)", + r" __lmod_sh_dbg=\'x\'", + r" ;; esac; fi; if [ -n \"${__lmod_sh_dbg:-}\" ]; then", + r"set +$__lmod_sh_dbg; echo \"Shell debugging 
temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod\'s output\" 1>&2; " + r"fi; eval \"$($LMOD_CMD shel", + ] + parsed = [] + times_and_data = fix_newlines_extract_time(lines, include_lineno=True) + print(times_and_data) + assert len(times_and_data) == 1 + for idx, (logtime, lineno, data) in enumerate(times_and_data): + print(f"{idx=} {logtime=} {data=}") + parsed_line = parse_pbs_log_line(logtime, data) + print(f" *** {parsed_line=} ***") + parsed.append(parsed_line) + + +def input_pbs_accounting_place(ids=False) -> List[Tuple[str, str, list, bool]]: + # comment, input, output, valid + deck = [ + ("", "excl", {"arrangement": None, "sharing": "excl", "groups": []}, True), + ("", "exclhost", {"arrangement": None, "sharing": "exclhost", "groups": []}, True), + ("", "free", {"arrangement": "free", "sharing": None, "groups": []}, True), + ("", "group=somegroup1", {"arrangement": None, "sharing": None, "groups": ["somegroup1"]}, True), + ("", "pack", {"arrangement": "pack", "sharing": None, "groups": []}, True), + ("", "scatter", {"arrangement": "scatter", "sharing": None, "groups": []}, True), + ("", "scatter:excl", {"arrangement": "scatter", "sharing": "excl", "groups": []}, True), + ("", "scatter:exclhost", {"arrangement": "scatter", "sharing": "exclhost", "groups": []}, True), + ("", "scatter:group=somegroup0", {"arrangement": "scatter", "sharing": None, "groups": ["somegroup0"]}, True), + ("", "vscatter", {"arrangement": "vscatter", "sharing": None, "groups": []}, True), + ( + "", + "scatter:group=somegroup4:group=somegroup8", + {"arrangement": "scatter", "sharing": None, "groups": ["somegroup4", "somegroup8"]}, + True, + ), + ("", "scatter:shared:group=somegroup8", {"arrangement": "scatter", "sharing": "shared", "groups": ["somegroup8"]}, True), + ("", "", {}, True), + ] + if ids: + deck = [f"{i:0>2}" for i, _ in enumerate(deck)] + return deck + + +@pytest.mark.parametrize("comment,value,correct,valid", input_pbs_accounting_place(), 
ids=input_pbs_accounting_place(ids=True)) +def test_parse_place(comment, value, correct, valid): + result = cast_to_place_dict(value) + # print(f'("{comment}", "{value}", {result}, {valid}),') + if valid: + assert result == correct, value + else: + assert result != correct, value + new_value = join_place_dict(result) + # we cannot compare directly because dictionaries are not ordered, convert again + new_result = cast_to_place_dict(new_value) + if valid: + assert result == new_result, value + else: + assert result != new_result, value + + +def input_pbs_accounting_select(ids=False) -> List[Tuple[str, str, list, bool]]: + # comment, input, output, valid + deck = [ + ("", "128:ngpus=4", [{"N": 128, "ngpus": "4"}], True), + ( + "", + "128:ncpus=64:ngpus=4:gputype=A100+32:ncpus=64:ngpus=2:gputype=A40", + [{"N": 128, "ncpus": "64", "ngpus": "4", "gputype": "A100"}, {"N": 32, "ncpus": "64", "ngpus": "2", "gputype": "A40"}], + True, + ), + ("", "128:ngpus=4:system=somemachine4", [{"N": 128, "ngpus": "4", "system": "somemachine4"}], True), + ("", "128:system=somemachine4", [{"N": 128, "system": "somemachine4"}], True), + ( + "", + "128:system=somemachine4:ncpus=32:ngpus=4", + [{"N": 128, "system": "somemachine4", "ncpus": "32", "ngpus": "4"}], + True, + ), + ( + "", + "128:system=somemachine4:vnode=^x3008c0s13b0n0", + [{"N": 128, "system": "somemachine4", "vnode": "^x3008c0s13b0n0"}], + True, + ), + ( + "", + "128:system=somemachine4:vnode=^x3016c0s19b0n0", + [{"N": 128, "system": "somemachine4", "vnode": "^x3016c0s19b0n0"}], + True, + ), + ( + "", + "128:system=somemachine4:vnode=^x3108c0s19b1n0", + [{"N": 128, "system": "somemachine4", "vnode": "^x3108c0s19b1n0"}], + True, + ), + ( + "", + "128:vnode=^x3206c0s37b0n0+1:vnode=^x3202c0s31b1n0", + [{"N": 128, "vnode": "^x3206c0s37b0n0"}, {"N": 1, "vnode": "^x3202c0s31b1n0"}], + True, + ), + ("", "129:ncpus=64:ngpus=4", [{"N": 129, "ncpus": "64", "ngpus": "4"}], True), + ("", "12:ncpus=12", [{"N": 12, "ncpus": "12"}], True), 
+ ("", "12:ncpus=32:ngpus=4", [{"N": 12, "ncpus": "32", "ngpus": "4"}], True), + ("", "12:ncpus=64:ngpus=4", [{"N": 12, "ncpus": "64", "ngpus": "4"}], True), + ("", "12:system=somemachine4", [{"N": 12, "system": "somemachine4"}], True), + ("", "12:system=somemachine4:ncpus=32:ngpus=4", [{"N": 12, "system": "somemachine4", "ncpus": "32", "ngpus": "4"}], True), + ("", "130:ncpus=64:ngpus=4", [{"N": 130, "ncpus": "64", "ngpus": "4"}], True), + ( + "", + "16:ncpus=64:build=False:debug=False:demand=False", + [{"N": 16, "ncpus": "64", "build": "False", "debug": "False", "demand": "False"}], + True, + ), + ("", "1:ncpus=1:system=generic_pbs", [{"N": 1, "ncpus": "1", "system": "generic_pbs"}], True), + ( + "", + "1:ncpus=1:obj_type=node:system=generic_pbs", + [{"N": 1, "ncpus": "1", "obj_type": "node", "system": "generic_pbs"}], + True, + ), + ("", "host=x3212c0s7b1n0:ncpus=64", [{"host": "x3212c0s7b1n0", "ncpus": "64"}], True), + ("", "ngpus=1", [{"ngpus": "1"}], True), + ("", "vnode=16", [{"vnode": "16"}], True), + ("", "", [], True), + ("", "vnode=1:system=somemachine4", [{"vnode": "1", "system": "somemachine4"}], True), + ( + "", + '1:ncpus=1:obj_type="bl:ck":system=generic_pbs', + [{"N": 1, "ncpus": "1", "obj_type": "bl:ck", "system": "generic_pbs"}], + True, + ), + ( + "", + '1:ncpus=1:obj_type="bl:ck":system="gener+ic_pbs"', + [{"N": 1, "ncpus": "1", "obj_type": "bl:ck", "system": "gener+ic_pbs"}], + True, + ), + ("", "1:vnode=^p0015-01+2:vnode=^p0001-01", [{"N": 1, "vnode": "^p0015-01"}, {"N": 2, "vnode": "^p0001-01"}], True), + ( + "", + '1:ncpus=1:obj_type="bl\\:o\\+ck2":system=dog', + [{"N": 1, "ncpus": "1", "obj_type": "bl\\:o\\+ck2", "system": "dog"}], + True, + ), + ("", "2:ncpus=2:mpiprocs=2+1:nfpus=4", [{"N": 2, "ncpus": "2", "mpiprocs": "2"}, {"N": 1, "nfpus": "4"}], True), + ( + "", + '1:ncpus=1:obj_type="node+4:obj_type=block":system="gener:ic_pbs":tier_16="fun+fun"', + [{"N": 1, "ncpus": "1", "obj_type": "node+4:obj_type=block", "system": 
"gener:ic_pbs", "tier_16": "fun+fun"}], + True, + ), + ( + "", + '1:ncpus=1:obj_type="bl:ck+arch=linux+host=gc-svr":system="gener+ic_pbs"', + [{"N": 1, "ncpus": "1", "obj_type": "bl:ck+arch=linux+host=gc-svr", "system": "gener+ic_pbs"}], + True, + ), + ] + if ids: + deck = [f"{i:0>2}" for i, _ in enumerate(deck)] + return deck + + +@pytest.mark.parametrize("comment,value,correct,valid", input_pbs_accounting_select(), ids=input_pbs_accounting_select(ids=True)) +def test_parse_select(comment, value, correct, valid): + result = cast_to_select_list(value) + # print(f"('{comment}', '{value}', {result}, {valid}),") + if valid: + assert result == correct + else: + assert result != correct + new_value = join_select_list(result) + # we cannot compare directly because dictionaries are not ordered, convert again + new_result = cast_to_select_list(new_value) + if valid: + assert result == new_result, value + else: + assert result != new_result, value + + +def input_create_pbs_log_line_from_record(ids=False): + deck = [ + ("", (datetime.now(), RecType.X, "1234", {}), "", True), + ("", (datetime.now(), RecType.X, "1234", {}), "", True), + ("", (datetime.now(), RecType.X, "1234", {"a": "1", "b": "2", "c": "3"}), "", True), + ("", (datetime.now(), RecType.X, "1234", {"a": 1, "b": 2, "c": 3}), "", False), + ] + if ids: + deck = [f"{i:0>2}" for i, _ in enumerate(deck)] + return deck + + +@pytest.mark.parametrize( + "comment,value,correct,valid", input_create_pbs_log_line_from_record(), ids=input_create_pbs_log_line_from_record(ids=True) +) +def test_create_pbs_log_line_from_record_00(comment, value, correct, valid): + timestamp, record_type, identifier, dct = value + edct = enhance_pbs_record(timestamp, record_type, identifier, dct=dct) + record_obj = create_pbs_record(timestamp, record_type, identifier, dct) + line = create_pbs_log_line_from_record(record_obj, attempt_str=True, iso8601=True) + logtime, data = pbs_line_extract_time(line, iso8601=True) + pbs_dct = 
parse_pbs_log_line(logtime, data) + if valid: + assert edct == pbs_dct + else: + assert edct != pbs_dct + + +@pytest.mark.parametrize( + "comment,line,valid_record,valid_data", input_parse_pbs_log_line(), ids=input_parse_pbs_log_line(ids=True) +) +def test_create_pbs_log_line_from_record_01(comment, line, valid_record, valid_data): + logtime, data = pbs_line_extract_time(line) + if valid_record == BadLineError: + with pytest.raises(BadLineError): + parse_pbs_log_line(logtime, data) + elif valid_record == Exception: + with pytest.raises(Exception): + parse_pbs_log_line(logtime, data) + else: + result = parse_pbs_log_line(logtime, data) + pprint(result, width=132) + assert result == valid_record, pformat(result, width=132) + record_type = result["record"] + identifier = result["identifier"] + record_obj = create_pbs_record(logtime, record_type, identifier, result) + new_line = create_pbs_log_line_from_record(record_obj, attempt_str=True, iso8601=False) + logtime_new, data_new = pbs_line_extract_time(new_line) + new_result = parse_pbs_log_line(logtime_new, data_new) + assert result == new_result + + +def input_reservations(ids=False): + """ + Test of PBS reservation record. + What are nodes=? What other data are we missing. Pull all somemachine4 and somemachine3 data here. 
+ """ + deck = [ + ( + ( + "04/04/2023 13:21:00;B;someresv43.somemachine4;owner=user1234@somemachine4-login-01 name=NULL queue=someresv43 " + "ctime=1680614422 start=1680614460 end=1680650460 duration=36000 nodes=(x3013c0s7b0n0:ncpus=64) " + "Authorized_Users=user1234@somemachine4-login-01 Resource_List.allow_account_check_failure=True " + "Resource_List.allow_negative_allocation=True Resource_List.backfill_factor=84600 " + "Resource_List.backfill_max=50 Resource_List.base_score=0 Resource_List.enable_backfill=0 " + "Resource_List.enable_fifo=0 Resource_List.enable_wfp=0 Resource_List.fifo_factor=1800 " + "Resource_List.ncpus=64 Resource_List.ni_resource=somemachine4 Resource_List.preempt_targets=NONE " + "Resource_List.score_boost=0 Resource_List.total_cpus=560 Resource_List.wfp_factor=100000 " + "Resource_List.nodect=1 Resource_List.select=host=x3013c0s7b0n0:ncpus=64 Resource_List.place=exclhost " + "Resource_List.walltime=10:00:00" + ), + 28, + 33, + { + "Resource_List.allow_account_check_failure": "True", + "Resource_List.allow_negative_allocation": "True", + "Resource_List.backfill_factor": "84600", + "Resource_List.backfill_max": "50", + "Resource_List.base_score": "0", + "Resource_List.enable_backfill": "0", + "Resource_List.enable_fifo": "0", + "Resource_List.enable_wfp": "0", + "Resource_List.fifo_factor": "1800", + "Resource_List.ncpus": 64, + "Resource_List.ni_resource": "somemachine4", + "Resource_List.nodect": 1, + "Resource_List.place": {"arrangement": None, "groups": [], "sharing": "exclhost"}, + "Resource_List.preempt_targets": "NONE", + "Resource_List.score_boost": "0", + "Resource_List.select": [{"host": "x3013c0s7b0n0", "ncpus": "64"}], + "Resource_List.total_cpus": "560", + "Resource_List.walltime": 36000, + "Resource_List.wfp_factor": "100000", + "action": "begin", + "Authorized_Users": ["user1234@somemachine4-login-01"], + "ctime": datetime(2023, 4, 4, 13, 20, 22), + "duration": 36000, + "end": datetime(2023, 4, 4, 23, 21), + "event_type": 
"RESERVATION", + "identifier": "someresv43.somemachine4", + "name": "NULL", + "queue": "someresv43", + "record": "B", + "scheduler_timestamp": datetime(2023, 4, 4, 13, 21), + "start": datetime(2023, 4, 4, 13, 21), + "nodes": [ + "x3013c0s7b0n0", + ], + "owner": "user1234@somemachine4-login-01", + }, + [], + ), + ( + ( + "01/21/2023 22:00:00;B;someresv23.pbsserver;owner=someadmin@somehost" + " name=ResvName queue=someresv23 ctime=1674250622 start=1674338400 end=1674424800 " + "duration=86400 nodes=(x3006c0s13b1n0:ncpus=64)+(x3006c0s19b0n0:ncpus=64)+(x3006c0s19b1n0:ncpus=64)+(" + "x3006c0s1b0n0:ncpus=64)+(x3006c0s1b1n0:ncpus=64)+(x3006c0s25b0n0:ncpus=64)+(x3006c0s25b1n0:ncpus=64)+(" + "x3006c0s31b0n0:ncpus=64)+(x3006c0s31b1n0:ncpus=64)+(x3006c0s37b0n0:ncpus=64)+(x3006c0s37b1n0:ncpus=64)+(" + "x3006c0s7b0n0:ncpus=64)+(x3006c0s7b1n0:ncpus=64)+(x3007c0s13b0n0:ncpus=64)+(x3007c0s13b1n0:ncpus=64)+(" + "x3007c0s19b0n0:ncpus=64)+(x3007c0s19b1n0:ncpus=64)+(x3007c0s1b0n0:ncpus=64)+(x3007c0s1b1n0:ncpus=64)+(" + "x3007c0s25b0n0:ncpus=64)+(x3007c0s25b1n0:ncpus=64)+(x3007c0s31b0n0:ncpus=64)+(x3007c0s31b1n0:ncpus=64)+(" + "x3007c0s37b0n0:ncpus=64)+(x3007c0s37b1n0:ncpus=64)+(x3007c0s7b0n0:ncpus=64)+(x3007c0s7b1n0:ncpus=64)+(" + "x3008c0s13b0n0:ncpus=64)+(x3008c0s13b1n0:ncpus=64)+(x3008c0s19b0n0:ncpus=64)+(x3008c0s19b1n0:ncpus=64)+(" + "x3008c0s1b0n0:ncpus=64)+(x3008c0s1b1n0:ncpus=64)+(x3008c0s25b0n0:ncpus=64)+(x3008c0s25b1n0:ncpus=64)+(" + "x3008c0s31b0n0:ncpus=64)+(x3008c0s31b1n0:ncpus=64)+(x3008c0s37b0n0:ncpus=64)+(x3008c0s37b1n0:ncpus=64)+(" + "x3008c0s7b0n0:ncpus=64)+(x3008c0s7b1n0:ncpus=64)+(x3009c0s13b0n0:ncpus=64)+(x3009c0s13b1n0:ncpus=64)+(" + "x3009c0s19b0n0:ncpus=64)+(x3009c0s19b1n0:ncpus=64)+(x3009c0s1b0n0:ncpus=64)+(x3009c0s1b1n0:ncpus=64)+(" + "x3009c0s25b0n0:ncpus=64)+(x3009c0s25b1n0:ncpus=64)+(x3009c0s31b0n0:ncpus=64)+(x3009c0s31b1n0:ncpus=64)+(" + "x3009c0s37b0n0:ncpus=64)+(x3009c0s37b1n0:ncpus=64)+(x3009c0s7b0n0:ncpus=64)+(x3009c0s7b1n0:ncpus=64)+(" + 
"x3010c0s13b0n0:ncpus=64)+(x3010c0s13b1n0:ncpus=64)+(x3010c0s19b0n0:ncpus=64)+(x3010c0s19b1n0:ncpus=64)+(" + "x3010c0s1b0n0:ncpus=64) Authorized_Users=someadmin@somehost " + "Resource_List.allow_account_check_failure=True Resource_List.allow_negative_allocation=True " + "Resource_List.backfill_factor=84600 Resource_List.backfill_max=50 Resource_List.base_score=0 " + "Resource_List.enable_backfill=0 Resource_List.enable_fifo=0 Resource_List.enable_wfp=0 " + "Resource_List.fifo_factor=1800 Resource_List.ncpus=3840 Resource_List.ni_resource=somemachine4 " + "Resource_List.preempt_targets=NONE Resource_List.score_boost=0 Resource_List.total_cpus=560 " + "Resource_List.wfp_factor=100000 Resource_List.nodect=60 Resource_List.select=60 Resource_List.place=free " + "Resource_List.walltime=24:00:00" + ), + 28, + 33, + { + "Authorized_Users": ["someadmin@somehost"], + "Resource_List.allow_account_check_failure": "True", + "Resource_List.allow_negative_allocation": "True", + "Resource_List.backfill_factor": "84600", + "Resource_List.backfill_max": "50", + "Resource_List.base_score": "0", + "Resource_List.enable_backfill": "0", + "Resource_List.enable_fifo": "0", + "Resource_List.enable_wfp": "0", + "Resource_List.fifo_factor": "1800", + "Resource_List.ncpus": 3840, + "Resource_List.ni_resource": "somemachine4", + "Resource_List.nodect": 60, + "Resource_List.place": {"arrangement": "free", "groups": [], "sharing": None}, + "Resource_List.preempt_targets": "NONE", + "Resource_List.score_boost": "0", + "Resource_List.select": [{"N": 60}], + "Resource_List.total_cpus": "560", + "Resource_List.walltime": 86400, + "Resource_List.wfp_factor": "100000", + "action": "begin", + "ctime": datetime(2023, 1, 20, 21, 37, 2), + "duration": 86400, + "end": datetime(2023, 1, 22, 22, 0), + "event_type": "RESERVATION", + "identifier": "someresv23.pbsserver", + "name": "ResvName", + "nodes": [ + "x3006c0s13b1n0", + "x3006c0s19b0n0", + "x3006c0s19b1n0", + "x3006c0s1b0n0", + "x3006c0s1b1n0", + 
"x3006c0s25b0n0", + "x3006c0s25b1n0", + "x3006c0s31b0n0", + "x3006c0s31b1n0", + "x3006c0s37b0n0", + "x3006c0s37b1n0", + "x3006c0s7b0n0", + "x3006c0s7b1n0", + "x3007c0s13b0n0", + "x3007c0s13b1n0", + "x3007c0s19b0n0", + "x3007c0s19b1n0", + "x3007c0s1b0n0", + "x3007c0s1b1n0", + "x3007c0s25b0n0", + "x3007c0s25b1n0", + "x3007c0s31b0n0", + "x3007c0s31b1n0", + "x3007c0s37b0n0", + "x3007c0s37b1n0", + "x3007c0s7b0n0", + "x3007c0s7b1n0", + "x3008c0s13b0n0", + "x3008c0s13b1n0", + "x3008c0s19b0n0", + "x3008c0s19b1n0", + "x3008c0s1b0n0", + "x3008c0s1b1n0", + "x3008c0s25b0n0", + "x3008c0s25b1n0", + "x3008c0s31b0n0", + "x3008c0s31b1n0", + "x3008c0s37b0n0", + "x3008c0s37b1n0", + "x3008c0s7b0n0", + "x3008c0s7b1n0", + "x3009c0s13b0n0", + "x3009c0s13b1n0", + "x3009c0s19b0n0", + "x3009c0s19b1n0", + "x3009c0s1b0n0", + "x3009c0s1b1n0", + "x3009c0s25b0n0", + "x3009c0s25b1n0", + "x3009c0s31b0n0", + "x3009c0s31b1n0", + "x3009c0s37b0n0", + "x3009c0s37b1n0", + "x3009c0s7b0n0", + "x3009c0s7b1n0", + "x3010c0s13b0n0", + "x3010c0s13b1n0", + "x3010c0s19b0n0", + "x3010c0s19b1n0", + "x3010c0s1b0n0", + ], + "queue": "someresv23", + "record": "B", + "scheduler_timestamp": datetime(2023, 1, 21, 22, 0), + "start": datetime(2023, 1, 21, 22, 0), + "owner": "someadmin@somehost", + }, + [], + ), + ] + if ids: + deck = [f"{i:0>2}" for i, _ in enumerate(deck)] + return deck + + +@pytest.mark.parametrize( + "line,len_key_value_dct,len_parse_dct,valid,valid_missing", input_reservations(), ids=input_reservations(ids=True) +) +def test_parse_pbs_reservations(line, len_key_value_dct, len_parse_dct, valid, valid_missing): + logtime, data = pbs_line_extract_time(line) + try: + parse_dct = parse_pbs_log_line(logtime, data) + except Parse_Error_PBS as e: + parse_dct = {"EXCEPTION": e} + record_type, identifier, record_keyvals = split_record(data) + key_value_dct = parse_key_value_pairs(record_keyvals) + missing_fields = [] + for field in key_value_dct.keys(): + try: + parse_dct[field] + except (KeyError, TypeError) 
as e: + if field != "active_id": + missing_fields.append(field) + assert len_key_value_dct == len(key_value_dct) + assert parse_dct == valid + assert missing_fields == valid_missing + assert parse_dct["ctime"] <= parse_dct["scheduler_timestamp"] + if parse_dct["record"] == "K": + assert parse_dct["stime"] <= parse_dct["etime"] + assert parse_dct["stime"] <= parse_dct["scheduler_timestamp"] + assert parse_dct["etime"] <= parse_dct["scheduler_timestamp"] diff --git a/tests/test_pbs_instrument.py b/tests/test_pbs_instrument.py new file mode 100644 index 0000000000000000000000000000000000000000..de932535416f4cc3bb33ce8a09d47fecd79782af --- /dev/null +++ b/tests/test_pbs_instrument.py @@ -0,0 +1,107 @@ +# Copyright (C) 2024, UChicago Argonne, LLC +# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file +# in the top-level directory. + +import os +import time +import pytest +import random +from pprint import pformat +from _pytest.assertion import truncate + +from PBS_Utils.pbs_instrument import instrument_setup, TracerSimple, ot_decorate_span, get_tracer, TracerFake + +truncate.DEFAULT_MAX_LINES = 8192 +truncate.DEFAULT_MAX_CHARS = 8192 + + +@pytest.mark.skipif(os.geteuid() == 0, reason="test requires a valid user, not 0") +def test_Tracer_OT_00(): + pytest.importorskip("opentelemetry", reason="opentelemetry not found.") + output_path = "/tmp/ot_example.jsonl" + tracer = None + + @ot_decorate_span(tracer) + def do_something_05(a, b): + return [random.random() for _ in range(int(a * b))] + + def do_something_06(a, b): + return [random.random() for _ in range(int(a * b))] + + do_something_05(100, 100) + do_something_06(100, 100) + + # will not setup a tracer + with instrument_setup(None, output_path) as tracer: + do_something_05(100, 100) + ot_decorate_span(tracer)(do_something_06)(10, 100) + + # do not start another tracer after this. 
+ with instrument_setup("example", output_path) as tracer: + + @ot_decorate_span(tracer) + def do_something_00(a: float, b: float): + return [random.random() for _ in range(int(a * b))] + + def do_something_01(a: float, b: float): + return [random.random() for _ in range(int(a * b))] + + @ot_decorate_span(tracer) + def do_something_02(): + raise ValueError("Boom!") + + def do_something_03(): + raise ValueError("Boom!") + + @ot_decorate_span(tracer) + def do_something_04(): + do_something_00(10, 100) + do_something_00(100, 100) + ot_decorate_span(tracer)(do_something_01)(10, 100) + ot_decorate_span(tracer)(do_something_01)(100, 100) + + do_something_00(10, 100) + do_something_00(100, 100) + ot_decorate_span(tracer)(do_something_01)(10, 100) + ot_decorate_span(tracer)(do_something_01)(100, 100) + try: + do_something_02() + except: + pass + + try: + ot_decorate_span(tracer)(do_something_03)() + except: + pass + + do_something_04() + + do_something_04() + + assert os.path.exists(output_path) + + +@pytest.mark.skipif(os.geteuid() == 0, reason="test requires a valid user, not 0") +def test_TracerFake_00(): + pytest.importorskip("opentelemetry", reason="opentelemetry not found.") + with instrument_setup("test_00", "/tmp/tracer_fake_00.jsonl", TracerFake) as tracer: + with tracer.start_as_current_span("a"): + a = 1 + with tracer.start_as_current_span("b"): + b = a + assert a == b + + +# @pytest.importorskip('opentelemetry', reason='opentelemetry not found.') +@pytest.mark.skipif(os.geteuid() == 0, reason="test requires a valid user, not 0") +def test_TracerSimple_01(): + pytest.importorskip("opentelemetry", reason="opentelemetry not found.") + sleep_time = 0.1 + with instrument_setup("test_01", "/tmp/tracer_simple_01.jsonl", TracerSimple) as tracer: + with tracer.start_as_current_span("a"): + time.sleep(sleep_time) + with tracer.start_as_current_span("b"): + time.sleep(sleep_time) + assert len(tracer.traces) == 3 + print(pformat(tracer.traces)) + assert (tracer.td_code - 
# NOTE(review): a trailing fragment of a tracer test preceded this file in the
# mangled paste: `(sleep_time * 2)) < 0.02, tracer.td_code - sleep_time` —
# its definition starts outside this view; restore it from the original file.

# Copyright (C) 2024, UChicago Argonne, LLC
# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file
# in the top-level directory.

import socket
import pytest
from PBS_Utils.pbs_library import PBSIFLInterface
import os

try:
    import pbs_ifl
except ImportError:
    pbs_ifl = None


@pytest.mark.skipif(pbs_ifl is None, reason="could not import pbs_ifl")
class TestPBSLibrary:
    """Integration tests for PBSIFLInterface.

    NOTE(review): these tests talk to a live PBS server (host "pdw-s1") and
    assume a clean 4-node system with no jobs or reservations; the whole
    class is skipped when pbs_ifl cannot be imported.
    """

    @classmethod
    def setup_class(cls):
        # One shared IFL connection for the whole class; closed in teardown_class.
        machine_name = "pbs_generic"
        pbs_host = "pdw-s1"
        debug = True
        cls.pbs_interface = PBSIFLInterface(machine_name=machine_name, host=pbs_host, debug=debug, pbs_ifl=pbs_ifl)

    @classmethod
    def teardown_class(cls):
        cls.pbs_interface.disconnect()

    @pytest.mark.skipif(os.geteuid() != 0, reason="test requires sudo")
    def test_pbs_stathook(self):
        # Hooks can only be stat'ed by root, hence the euid guard above.
        data = self.pbs_interface.pbs_stathook("", None, None)
        print(data)

    def test_pbs_stat_server_00(self):
        server = self.pbs_interface.host
        data = self.pbs_interface.pbs_statserver(server, "python_restart_max_hooks")
        print(data)

    def test_get_pbs_vnodes(self):
        data = self.pbs_interface.get_pbs_vnodes()
        print(data)

    def test_pbs_manager(self):
        # NOTE(review): get_attropl's return value is discarded; presumably it
        # registers the attribute on the interface before pbs_manager("set", ...)
        # is issued — confirm against PBSIFLInterface.
        self.pbs_interface.get_attropl("python_restart_max_hooks", value="1000000000")
        data = self.pbs_interface.pbs_manager("set", "server")
        print(data)

    def test_qstat(self):
        data = self.pbs_interface.get_pbs_server()
        print(data)

    def test_get_pbs_jobs(self):
        """Assuming no jobs have been started (clean system)"""
        data = self.pbs_interface.get_pbs_jobs()
        assert len(data["data"]) == 0
        assert data["metadata"]["machine_name"] == "pbs_generic"

    def test_get_pbs_nodes(self):
        """Assuming 4 node system"""
        data = self.pbs_interface.get_pbs_nodes()
        assert len(data["data"]) == 4
        assert data["metadata"]["machine_name"] == "pbs_generic"

    def test_get_pbs_reservations(self):
        """Assuming no reservations present on system (clean system)"""
        data = self.pbs_interface.get_pbs_reservations()
        assert len(data["data"]) == 0
        assert data["metadata"]["machine_name"] == "pbs_generic"

    @pytest.mark.skipif(os.geteuid() == 0, reason="test requires a valid user, not 0")
    def test_pbs_submit(self):
        """Submitting job to pbs_ifl and checking"""
        assert len(self.pbs_interface.get_pbs_jobs()["data"]) == 0

        job_script = "tests/submit_script.sh"
        attropl_list = [{"name": "Resource_List", "value": "1:ncpus=1", "resource": "select"}]
        jobid = self.pbs_interface.pbs_submit(job_script, attropl_list)
        # Fixed: was `> 1`; a job id only needs to be non-empty, consistent
        # with the identical check in test_pbs_submit2.
        assert len(jobid) > 0

        assert len(self.pbs_interface.get_pbs_jobs()["data"]) == 1

        # delete job and test
        self.pbs_interface.pbs_delete(jobid, wait=True)

        # wait for job to disappear
        assert len(self.pbs_interface.get_pbs_jobs()["data"]) == 0

        # check for 0
        data = self.pbs_interface.get_pbs_jobs()
        assert len(data["data"]) == 0

    @pytest.mark.skipif(os.geteuid() == 0, reason="test requires a valid user, not 0")
    def test_pbs_submit2(self):
        """Submitting job to pbs_ifl and checking job data (also assuming clean)"""
        assert len(self.pbs_interface.get_pbs_jobs()["data"]) == 0

        data = self.pbs_interface.get_pbs_jobs()
        assert len(data["data"]) == 0
        assert data["metadata"]["machine_name"] == "pbs_generic"

        # submitting job
        job_script = "tests/submit_script.sh"
        attropl_list = [{"name": "Resource_List", "value": "1:ncpus=1", "resource": "select"}]
        joblist = []
        for i in range(2):
            jobid = self.pbs_interface.pbs_submit(job_script, attropl_list)
            assert len(jobid) > 0
            joblist.append(jobid)

        # recheck number of jobs
        data = self.pbs_interface.get_pbs_jobs()
        assert len(data["data"]) == 2
        assert data["metadata"]["machine_name"] == "pbs_generic"
        for job in data["data"].keys():
            assert data["data"][job]["Resource_List"]["ncpus"] == "1"

        # kill the jobs
        for jobid in joblist:
            self.pbs_interface.pbs_delete(jobid, wait=True)

        # make sure all jobs are gone
        assert len(self.pbs_interface.get_pbs_jobs()["data"]) == 0

    def test_get_pbs_queues_00(self):
        # (Redundant per-method skipif removed; the class-level marker already
        # skips every test when pbs_ifl is missing.)
        result_dct = self.pbs_interface.get_pbs_queues()
        queues_dct = result_dct["data"]
        # NOTE(review): `len(...) >= 0` is vacuous — it only proves the value
        # is sized. Kept for parity; consider asserting on queue content.
        assert len(queues_dct) >= 0

    def test_get_pbs_server_00(self):
        # (Redundant per-method skipif removed; see class-level marker.)
        server = self.pbs_interface.get_pbs_server_host()
        assert server == socket.gethostname()

        result_dct = self.pbs_interface.get_pbs_server()
        server_dct = result_dct["data"]
        # NOTE(review): vacuous length check; kept for parity with original.
        assert len(server_dct) >= 0

        # A single attrl narrows the server stat to exactly one attribute.
        attrl = self.pbs_interface.get_attrl("python_restart_max_hooks")
        result_dct = self.pbs_interface.get_pbs_server(attrl=attrl)
        server_dct = result_dct["data"][server]
        assert len(server_dct) == 1

        # Chained attrl (linked-list style via next_attrl) selects two attributes.
        attrl_mh = self.pbs_interface.get_attrl("python_restart_max_hooks")
        attrl_mo = self.pbs_interface.get_attrl("python_restart_max_objects", next_attrl=attrl_mh)
        result_dct = self.pbs_interface.get_pbs_server(attrl=attrl_mo)
        server_dct = result_dct["data"][server]
        assert len(server_dct) == 2
# --- tests/test_pbs_util.py (tail; license header precedes this view) -------

from PBS_Utils.pbs_util import get_now, get_now_day_str, datetime_to_str


def test_get_now_00():
    """get_now: consecutive calls are <1s apart; notz toggles tz-awareness."""
    now = get_now()
    now2 = get_now()
    assert (now2 - now).total_seconds() < 1

    # notz=True -> naive datetimes (tzinfo stripped).
    now = get_now(notz=True)
    now2 = get_now(notz=True)
    assert (now2 - now).total_seconds() < 1
    assert now.tzinfo is None
    assert now2.tzinfo is None

    # notz=False -> timezone-aware datetimes.
    now = get_now(notz=False)
    now2 = get_now(notz=False)
    assert (now2 - now).total_seconds() < 1
    assert now.tzinfo is not None
    assert now2.tzinfo is not None


def test_get_now_day_str_00():
    """Day string is 8 characters — presumably YYYYMMDD; confirm in pbs_util."""
    now = get_now_day_str()
    assert now is not None
    assert len(now) == 8


def test_datetime_to_str_00():
    # NOTE(review): 26 chars matches isoformat with microseconds
    # (YYYY-MM-DDTHH:MM:SS.ffffff) — brittle if microseconds are ever zero;
    # confirm datetime_to_str always emits a fixed-width value.
    now = get_now()
    result = datetime_to_str(now)
    assert result is not None
    assert len(result) == 26


# --- tests/test_plquery.py --------------------------------------------------

# Copyright (C) 2024, UChicago Argonne, LLC
# Licensed under the 3-clause BSD license. See accompanying LICENSE.txt file
# in the top-level directory.

import os
import pytest
import traceback
from pprint import pformat
from click.testing import CliRunner
from PBS_Utils.scripts.plquery import cli

try:
    import pbs_ifl
except ImportError:
    pbs_ifl = None
# Canned plquery snapshot data checked into the test tree.
LOADDATA_DIR = os.path.join(os.path.dirname(__file__), "data/pldata")


def get_data_prefix(prefix_lst=None):
    """Build the parametrize matrix: every CLI invocation crossed with every
    dataset prefix.

    prefix_lst: optional list of argument tuples selecting a dataset; defaults
    to the latest data plus three known snapshots (described inline below).
    Returns a list of argv lists suitable for CliRunner.invoke.
    """
    if prefix_lst is None:
        prefix_lst = [
            ("-l",),  # latest data
            ("--data-prefix", "20231003T150529"),  # 1 job in queue, pdw-c03 offline, 3 free, one reservation confirmed.
            ("--data-prefix", "20231003T150639"),  # no jobs, no reservations, all 4 nodes free.
            (
                "--data-prefix",
                "20231003T151028",
            ),  # 1 running job, one stuck job, pdw-c03 offline, 2 free, one resv confirmed, one resv running."
        ]
    # Every plquery subcommand/option combination exercised by the suite.
    call_lst = [
        ("job",),
        (
            "--groupby",
            "jobid",
            "--groupby",
            "job_state",
            "--agg",
            "vnode=count",
            "job",
        ),
        (
            "--agg",
            "isavailable=sum",
            "avail",
        ),
        (
            "--filt",
            "isavailable=1",
            "avail",
        ),
        (
            "--filt",
            "isavailable=1",
            "--header",
            "vnode",
            "--bare",
            "avail",
        ),
        (
            "--header",
            "vnode,resources_available.ncpus",
            "--filt",
            "resources_available.ncpus=16",
            "vnode",
        ),
        (
            "--header",
            "vnode",
            "--filt",
            "resources_available.ncpus=16",
            "--header",
            "vnode",
            "--bare",
            "vnode",
        ),
        (
            "--header",
            "vnode,resources_available.ncpus",
            "--filt",
            "resources_available.ncpus=0",
            "vnode",
        ),
        ("queue-jobs-agg",),
        (
            "-f",
            "job_state=running",
            "queue-jobs-agg",
        ),
        ("queue-avail",),
        (
            "--filt",
            "queue=default",
            "queue-avail",
        ),
        (
            "--groupby",
            "isavailable",
            "--agg",
            "isavailable=count",
            "avail",
        ),
        (
            "--groupby",
            "isavailable",
            "--agg",
            "isavailable=count,sum",
            "avail",
        ),
        (
            "--filt",
            "node_state=free",
            "--filt",
            "isavailable=0",
            "avail",
        ),
        (
            "--filt",
            "node_state=free",
            "--filt",
            "isavailable=0",
            "--agg",
            "vnode=count",
            "avail",
        ),
        (
            "--filt",
            "queue=somequeue",
            "queue-avail",
        ),
        (
            "--filt",
            "queue=somequeue",
            "avail",
        ),
        (
            "--agg",
            "isdown=count,sum",
            "--filt",
            "isdown=1",
            "avail",
            "--filt-in-or",
            "node_state_lst=down,offline,state-unknown,Stale",
            "--filt-in-or",
            "isavailable=0,1",
        ),
        ("--header", "vnode", "--bare", "--bare-delim", "☃", "--filt", "queue=validation", "--filt", "isavailable=1", "avail"),
        (
            "--filt",
            "queue=workq",
            "--filt-or",
            "vnode=pdw-c02",
            "--filt-or",
            "vnode=pdw-c01",
            "--filt-or",
            "vnode=pdw-c03",
            "avail",
        ),
        (
            "--filt",
            "queue=workq",
            "--filt",
            "isavailable=1",
            "--filt-or",
            "vnode=pdw-c02",
            "--filt-or",
            "vnode=pdw-c01",
            "--filt-or",
            "vnode=pdw-c03",
            "avail",
        ),
        (
            "job-special",
            "--option",
            "stuck",
        ),
        (
            "--header",
            "jobid",
            "--bare",
            "job-special",
            "--option",
            "stuck",
        ),
    ]
    # Cross product: each dataset prefix with each CLI call.
    test_lst = []
    for prefix in prefix_lst:
        for call in call_lst:
            test_lst.append([*prefix, *call])
    return test_lst


@pytest.mark.skipif(pbs_ifl is None, reason="could not import pbs_ifl")
def test_dump_ifl_00(tmp_path):
    # Dump against a live server into a temp PLPATH; only checks exit status.
    runner = CliRunner()
    result = runner.invoke(cli, ["dump", "--dpn", "--dqs"], env={"PLPATH": str(tmp_path)})
    assert result.exit_code == 0, pformat(traceback.format_exception(*result.exc_info))


@pytest.mark.parametrize("option", get_data_prefix())
def test_cli_00(option):
    """test the use of job with various datasets."""
    runner = CliRunner()
    result = runner.invoke(cli, option, env={"PLPATH": LOADDATA_DIR})
    assert result.exit_code == 0, pformat(traceback.format_exception(*result.exc_info))


@pytest.mark.parametrize(
    "option",
    get_data_prefix(
        prefix_lst=[
            (),  # no dataset prefix: exercises the live-IFL path
        ]
    ),
)
@pytest.mark.skipif(pbs_ifl is None, reason="could not import pbs_ifl")
def test_job_ifl_00(option):
    # Same matrix as test_cli_00 but with no data prefix, so it needs pbs_ifl.
    runner = CliRunner()
    result = runner.invoke(cli, option, env={"PLPATH": LOADDATA_DIR})
    assert result.exit_code == 0, pformat(traceback.format_exception(*result.exc_info))