From 14c227026f2de776f15b712ad778ced9adce787d Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 10:02:25 +0200 Subject: [PATCH 01/10] dell: Do not discover OEM endpoint at every run get_oem_system() is trying to find the right endpoint as depending on systems they might vary. During the monitoring loop, if you are in the case where we need to use the /Attributes, the function generates two redfish calls. Once the right endpoint is found, let's reuse it directly at each call to save time and reduce the number of redfish calls we perform. Signed-off-by: Erwan Velu --- hwbench/environment/vendors/dell/dell.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/hwbench/environment/vendors/dell/dell.py b/hwbench/environment/vendors/dell/dell.py index c0aaa62..6930acd 100644 --- a/hwbench/environment/vendors/dell/dell.py +++ b/hwbench/environment/vendors/dell/dell.py @@ -11,6 +11,8 @@ class IDRAC(BMC): + oem_endpoint = "" + def get_thermal(self): return self.get_redfish_url("/redfish/v1/Chassis/System.Embedded.1/Thermal") @@ -42,17 +44,27 @@ def get_power(self): return self.get_redfish_url("/redfish/v1/Chassis/System.Embedded.1/Power") def get_oem_system(self): + # If we already found the proper endpoint, let's reuse it. 
+ if self.oem_endpoint: + return self.get_redfish_url( + self.oem_endpoint, + log_failure=False, + ) + + new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1" oem = self.get_redfish_url( - "/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1", + new_oem_endpoint, log_failure=False, ) # If not System.Embedded, let's use the default attributes if "Attributes" not in oem: - oem = self.get_redfish_url( - "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes" - ) + new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes" + oem = self.get_redfish_url(new_oem_endpoint) if "Attributes" not in oem: h.fatal("Cannot find Dell OEM metrics, please fill a bug.") + + # Let's save the endpoint to avoid trying all of them at every run + self.oem_endpoint = new_oem_endpoint return oem def read_power_consumption( From f51640182dc129dac45a20e29829cb06f15fa05e Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 14:04:13 +0200 Subject: [PATCH 02/10] hpe/ilorest: Prevent crash on single-node chassis A typical crash trace look like : Traceback (most recent call last): File "/usr/lib64/python3.9/runpy.py", line 197, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib64/python3.9/runpy.py", line 87, in _run_code exec(code, run_globals) File "/home/e.velu/hwbench/hwbench.py", line 117, in main() File "/home/e.velu/hwbench/hwbench.py", line 44, in main benches.parse_jobs_config() File "/home/e.velu/hwbench/bench/benchmarks.py", line 128, in parse_jobs_config self.__schedule_benchmarks( File "/home/e.velu/hwbench/bench/benchmarks.py", line 145, in __schedule_benchmarks self.__schedule_benchmark(job, pinned_cpu, emp, validate_parameters) File "/home/e.velu/hwbench/bench/benchmarks.py", line 160, in __schedule_benchmark self.hardware.vendor.get_bmc().connect_redfish() File "/home/e.velu/hwbench/environment/vendors/bmc.py", line 82, in connect_redfish return 
super().connect_redfish(bmc_username, bmc_password, self.get_url()) File "/home/e.velu/hwbench/environment/vendors/hpe/hpe.py", line 31, in get_url ipv4 = self.ilo.get_bmc_ipv4() File "/home/e.velu/hwbench/environment/vendors/hpe/ilorest.py", line 149, in get_bmc_ipv4 if "Manager Dedicated Network Interface" in nc.get("Name"): AttributeError: 'str' object has no attribute 'get' get_bmc_ipv4() make the following ilorest call : ilorest list --select ethernetinterface --filter "id=1" -j. On multi-node chassis like Apollo 2000 Gen10, the output is a list of interfaces. On single-node chassis like a DL380, the output is a dict. The code was considering the first case and crashed on the single-node servers. This patch ensures that if we get a single interface, we convert it into a single item list. This allow us having the same parsing code for both cases. Tested on DL380, ilorest 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.03' Tested on XL225n (Apollo 2000 Gen10), iloreset 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.08' Tested on DL340Gen12, ilorest 4.9.0.0, Model: 'iLO 6' FW: 'iLO 6 v1.62' Signed-off-by: Erwan Velu --- hwbench/environment/vendors/hpe/ilorest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hwbench/environment/vendors/hpe/ilorest.py b/hwbench/environment/vendors/hpe/ilorest.py index e458988..7e95a6e 100644 --- a/hwbench/environment/vendors/hpe/ilorest.py +++ b/hwbench/environment/vendors/hpe/ilorest.py @@ -140,6 +140,11 @@ def get_bmc_ipv4(self): select="ethernetinterface", filter="id=1", to_json=True ) if bmc_netconfig: + # On multi-node chassis, the ethernetinterface is a list + # On single-node chassis, the ethernetinterface is a dict + # Let's ensure we always have a list for get a single parsing. 
+ if isinstance(bmc_netconfig, dict): + bmc_netconfig = [bmc_netconfig] for nc in bmc_netconfig: if "Manager Dedicated Network Interface" in nc.get("Name"): ipv4 = nc.get("IPv4Addresses") From 5528f197706fc064715e3e32061ad567b0f390c4 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 15:05:48 +0200 Subject: [PATCH 03/10] hpe: Do not crash if not all PSUs are enabled When a PSU is missing the PowerSupplies endpoint looks like the following : {'@odata.id': '/redfish/v1/Chassis/1/Power/#PowerSupplies/0', 'FirmwareVersion': '0.23.49', 'LastPowerOutputWatts': 144, 'LineInputVoltage': 233, 'LineInputVoltageType': 'ACHighLine', 'Manufacturer': None, 'MemberId': '0', 'Model': 'P67240-B21', 'Name': 'HpeServerPowerSupply', 'Oem': {'Hpe': {'@odata.context': '/redfish/v1/$metadata#HpeServerPowerSupply.HpeServerPowerSupply', '@odata.type': '#HpeServerPowerSupply.v2_0_0.HpeServerPowerSupply', 'AveragePowerOutputWatts': 144, 'BayNumber': 1, 'Domain': 'System', 'HotplugCapable': True, 'MaxPowerOutputWatts': 143, 'Mismatched': False, 'PowerSupplyStatus': {'State': 'Ok'}, 'iPDUCapable': False}}, 'PowerCapacityWatts': 1000, 'PowerSupplyType': 'Unknown', 'SerialNumber': 'XXXXXXXXXXU', 'SparePartNumber': 'P68455-001', 'Status': {'Health': 'OK', 'State': 'Enabled'}} {'@odata.id': '/redfish/v1/Chassis/1/Power/#PowerSupplies/1', 'MemberId': '1', 'Oem': {'Hpe': {'@odata.context': '/redfish/v1/$metadata#HpeServerPowerSupply.HpeServerPowerSupply', '@odata.type': '#HpeServerPowerSupply.v2_0_0.HpeServerPowerSupply', 'BayNumber': 2}}, 'Status': {'Health': 'Warning', 'State': 'Absent'}} The missing 'Name' member, for PSU 1 here, crashed the application. This commit is about managing the PSU presence by checking the 'Status' member and its content. If no 'Status' and/or 'State' found, a custom error message will be reported to the user. A cache is used to avoid polluting the output. 
Tested on DL380, ilorest 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.03' Tested on XL225n (Apollo 2000 Gen10), iloreset 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.08' Tested on DL340Gen12, ilorest 4.9.0.0, Model: 'iLO 6' FW: 'iLO 6 v1.62' Reported-by: Elyes ZEKRI Signed-off-by: Erwan Velu --- hwbench/environment/vendors/hpe/hpe.py | 43 ++++++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/hwbench/environment/vendors/hpe/hpe.py b/hwbench/environment/vendors/hpe/hpe.py index 3a6577a..9293490 100644 --- a/hwbench/environment/vendors/hpe/hpe.py +++ b/hwbench/environment/vendors/hpe/hpe.py @@ -1,5 +1,7 @@ +import logging import pathlib import re +from functools import cache from typing import cast from ....bench.monitoring_structs import ( @@ -84,6 +86,10 @@ def add(self, name): def get_power(self): return self.get_redfish_url("/redfish/v1/Chassis/1/Power/") + @cache + def __warn_psu(self, psu_number, message): + logging.error(f"PSU {psu_number}: {message}") + def read_power_supplies( self, power_supplies: dict[str, dict[str, Power]] = {} ) -> dict[str, dict[str, Power]]: @@ -91,17 +97,34 @@ def read_power_supplies( if str(PowerContext.BMC) not in power_supplies: power_supplies[str(PowerContext.BMC)] = {} # type: ignore[no-redef] for psu in self.get_power().get("PowerSupplies"): - # Both PSUs are named the same (HpeServerPowerSupply) + psu_position = str(psu["Oem"]["Hpe"]["BayNumber"]) + # All PSUs are named the same (HpeServerPowerSupply) # Let's update it to have a unique name - name = psu["Name"] + str(psu["Oem"]["Hpe"]["BayNumber"]) - psu_name = "PS" + str(psu["Oem"]["Hpe"]["BayNumber"]) - super().add_monitoring_value( - cast(dict[str, dict[str, MonitorMetric]], power_supplies), - PowerContext.BMC, - Power(psu_name), - name, - psu["Oem"]["Hpe"]["AveragePowerOutputWatts"], - ) + psu_status = psu.get("Status") + if psu_status: + psu_state = psu_status.get("State") + if psu_state: + # We only consider healthy PSU + if str(psu_state).lower() == 
"enabled": + name = psu["Name"] + psu_position + psu_name = "PS" + psu_position + super().add_monitoring_value( + cast(dict[str, dict[str, MonitorMetric]], power_supplies), + PowerContext.BMC, + Power(psu_name), + name, + psu["Oem"]["Hpe"]["AveragePowerOutputWatts"], + ) + else: + # Let's inform the user the PSU is reported as non healthy + self.__warn_psu( + psu_position, + f'marked as {psu_state} in {psu_status.get("Health")} state', + ) + continue + + # Let's inform the user that no status was found, maybe a parsing or fw issue ? + self.__warn_psu(psu_position, "no status or state found !") return power_supplies From d79bb7fe00a6cf62b9cdb80ce87b9ce004336e5c Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 15:21:05 +0200 Subject: [PATCH 04/10] hpe: Remove redundant calls to enclosurechassis endpoint On mono-node HPE chassis, the current code is calling the enclosurechassis endpoint for each monitoring loop. This call is perfectly useless since the enclosurechassis does not exists. This patch is about adding a is_multinode_chassis() function to be cached that try this endpoint and get a constant answer. Regarding this cached status, a real read of the enclosurechassis is done for multinode chassis only. 
Tested on DL380, ilorest 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.03' Tested on XL225n (Apollo 2000 Gen10), iloreset 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.08' Tested on DL340Gen12, ilorest 4.9.0.0, Model: 'iLO 6' FW: 'iLO 6 v1.62' Signed-off-by: Erwan Velu --- hwbench/environment/vendors/hpe/hpe.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/hwbench/environment/vendors/hpe/hpe.py b/hwbench/environment/vendors/hpe/hpe.py index 9293490..f6b939a 100644 --- a/hwbench/environment/vendors/hpe/hpe.py +++ b/hwbench/environment/vendors/hpe/hpe.py @@ -168,11 +168,23 @@ def read_power_consumption( ) return power_consumption - def get_oem_chassis(self): - return self.get_redfish_url( - "/redfish/v1/Chassis/enclosurechassis/", log_failure=False + @cache + def is_multinode_chassis(self) -> bool: + return ( + True + if self.get_redfish_url( + "/redfish/v1/Chassis/enclosurechassis/", log_failure=False + ) + else False ) + def get_oem_chassis(self): + if self.is_multinode_chassis(): + return self.get_redfish_url( + "/redfish/v1/Chassis/enclosurechassis/", log_failure=False + ) + return {} + class Hpe(Vendor): def __init__(self, out_dir, dmi, monitoring_config_filename): From 47bc82b7f38b360d503df5a21a0d48c0d7bb98eb Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 18:53:02 +0200 Subject: [PATCH 05/10] monitoring: Increasing turbostat time budget By default, turbostat was given 200ms to start & complete. When CPUs do not have hyperthreading and all cores are loaded by a benchmark, like stress-ng, turbostat takes more than 200ms. As a result, the self.turbostat.parse() call in monitoring.py is waiting turbostat to complete which generates a timing overdue. 
This was generating traces like : hwbench: 1 jobs, 1 benchmarks, ETA 0h 01m 00s [full_cpu_load] stressng/cpu/matrixprod(M): 128 stressor on CPU [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127] for 60s Monitoring iteration 8 is 0.12ms late Monitoring iteration 9 is 0.47ms late Monitoring iteration 10 is 0.78ms late Monitoring iteration 11 is 1.19ms late Monitoring iteration 12 is 1.28ms late Monitoring iteration 13 is 1.59ms late Monitoring iteration 14 is 1.90ms late Monitoring iteration 15 is 2.22ms late Monitoring iteration 16 is 2.53ms late Monitoring iteration 17 is 2.85ms late Monitoring iteration 18 is 3.54ms late Monitoring iteration 19 is 3.48ms late Monitoring iteration 20 is 3.82ms late Monitoring iteration 21 is 4.15ms late Monitoring iteration 22 is 4.42ms late Monitoring iteration 23 is 4.71ms late Monitoring iteration 24 is 5.02ms late Monitoring iteration 25 is 5.57ms late Monitoring iteration 26 is 5.66ms late Monitoring iteration 27 is 5.89ms late Monitoring iteration 28 is 5.77ms late This patch is lamely increasing the time budget to 500ms. On the affeacted machines, it solved the issue. Maybe at some point we'll have to increase the 'precision' window to get a better ratio time budget vs run time. Tested on Intel(R) Xeon(R) 6756E systems. 
Signed-off-by: Erwan Velu --- hwbench/bench/monitoring.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hwbench/bench/monitoring.py b/hwbench/bench/monitoring.py index 9507180..c43376b 100644 --- a/hwbench/bench/monitoring.py +++ b/hwbench/bench/monitoring.py @@ -192,8 +192,8 @@ def next_iter(): start_time = self.get_monotonic_clock() if self.turbostat: # Turbostat will run for the whole duration of this loop - # We just retract a 2/10th of second to ensure it will not overdue - self.turbostat.run(interval=(precision - 0.2)) + # We just retract a 5/10th of second to ensure it will not overdue + self.turbostat.run(interval=(precision - 0.5)) # Let's monitor the time spent at monitoring the CPU self.get_metric(Metrics.MONITOR)["CPU"]["Polling"].add( (self.get_monotonic_clock() - start_time) * 1e-6 From 503e5bae32200f7a7226945d1feaa62d62a2428e Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 11:40:44 +0200 Subject: [PATCH 06/10] hwgraph: Do not crash with empty monitoring metrics If a monitoring metric is empty, do not crash hwgraph but rather display a warning. This was seen in issue #35. The following metric was causing the issue : "PDU": { "Polling": { "name": "Polling", "unit": "ms", "mean": [], "min": [], "max": [], "stdev": [], "samples": [] } }, With this commit a typical output looks like the following: environment: rendering 131 jobs from scaleway/hwbench-out-20241023125613/results.json (DL320) avx_1: No samples found in PDU.Polling, ignoring metric. avx_10: No samples found in PDU.Polling, ignoring metric. avx_11: No samples found in PDU.Polling, ignoring metric. avx_12: No samples found in PDU.Polling, ignoring metric. avx_13: No samples found in PDU.Polling, ignoring metric. 
Reported-by: Elyes ZEKRI Signed-off-by: Erwan Velu --- graph/hwgraph.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/graph/hwgraph.py b/graph/hwgraph.py index 798a8d3..27d5334 100755 --- a/graph/hwgraph.py +++ b/graph/hwgraph.py @@ -94,14 +94,20 @@ def graph_monitoring_metrics(args, trace: Trace, bench_name: str, output_dir) -> metrics = bench.get_component(Metrics.MONITOR, metric_name) if metrics: for metric in metrics: - rendered_graphs += yerr_graph( - args, - output_dir, - bench, - Metrics.MONITOR, - metrics[metric], - prefix=f"{metric_name}.", - ) + # If a metric has no measure, let's ignore it + if len(metrics[metric].get_samples()) == 0: + print( + f"{bench_name}: No samples found in {metric_name}.{metric}, ignoring metric." + ) + else: + rendered_graphs += yerr_graph( + args, + output_dir, + bench, + Metrics.MONITOR, + metrics[metric], + prefix=f"{metric_name}.", + ) return rendered_graphs From c7669f9cb362a13259131a4ff0a10c0fa4bc6fde Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 12:44:48 +0200 Subject: [PATCH 07/10] hwgraph: Adding metric get_full_name() When loading metrics, it appear that some can have the same name making invalid data, data name and misleading graphs. This was seen in issue #36. The BMC reported the two following structures : "CPU": { "02-CPU 1 PkgTmp": { "name": "CPU1", "unit": "Celsius", "mean": [ [...] "12-CPU 1": { "name": "CPU1", "unit": "Celsius", "mean": [ As we use the name member, we got the two different metrics being merged as a single one. This patch is adding a workaround by consider the parent's name as the "full_name" of the metric. So this patch is adding this value, like "12-CPU 1", as the full name and reachable via get_full_name(). This commit ensures the env graphs are using this full name to avoid smashing metrics. 
Signed-off-by: Erwan Velu --- graph/graph.py | 10 +++++----- graph/trace.py | 2 +- hwbench/bench/monitoring_structs.py | 9 ++++++++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/graph/graph.py b/graph/graph.py index e1ce3c0..54ea058 100644 --- a/graph/graph.py +++ b/graph/graph.py @@ -260,9 +260,9 @@ def generic_graph( time_serie.append(time) # Collect all components mean value for component in components: - if component.get_name() not in data_serie: - data_serie[component.get_name()] = [] - data_serie[component.get_name()].append(component.get_mean()[sample]) + if component.get_full_name() not in data_serie: + data_serie[component.get_full_name()] = [] + data_serie[component.get_full_name()].append(component.get_mean()[sample]) if second_axis: for _, entry in bench.get_monitoring_metric(second_axis).items(): @@ -291,8 +291,8 @@ def generic_graph( graph.get_ax2().plot(x_serie, y2_serie, "", label=data2_item, marker="o") for component in components: - y_serie = np.array(data_serie[component.get_name()])[order] - graph.get_ax().plot(x_serie, y_serie, "", label=component.get_name()) + y_serie = np.array(data_serie[component.get_full_name()])[order] + graph.get_ax().plot(x_serie, y_serie, "", label=component.get_full_name()) graph.prepare_axes( 30, diff --git a/graph/trace.py b/graph/trace.py index 7e8bbca..beed903 100644 --- a/graph/trace.py +++ b/graph/trace.py @@ -82,7 +82,7 @@ def load_monitoring(self): mm = Temperature(measure, original_measure["unit"]) else: mm = MonitorMetric(measure, original_measure["unit"]) - mm.load_from_dict(original_measure) + mm.load_from_dict(original_measure, measure) self.metrics[metric][component_family][measure] = mm else: fatal(f"Unexpected {metric} in monitoring") diff --git a/hwbench/bench/monitoring_structs.py b/hwbench/bench/monitoring_structs.py index 9997dab..b43ce24 100644 --- a/hwbench/bench/monitoring_structs.py +++ b/hwbench/bench/monitoring_structs.py @@ -26,7 +26,8 @@ def __post_init__(self): # If a 
value is given, let's add it self.add(self.value) - def load_from_dict(self, input: dict): + def load_from_dict(self, input: dict, full_name: str): + self.full_name = full_name self.name = str(input.get("name")) self.unit = str(input.get("unit")) self.mean = input.get("mean") # type: ignore[assignment] @@ -35,6 +36,12 @@ def load_from_dict(self, input: dict): self.stdev = input.get("stdev") # type: ignore[assignment] self.samples = input.get("samples") # type: ignore[assignment] + def get_full_name(self): + """Return the metric full name""" + if self.full_name: + return self.full_name + return self.name + def get_name(self): """Return the metric name""" return self.name From 30848178ca583e41d0531202113b8fd37f650226 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 14:46:19 +0200 Subject: [PATCH 08/10] hwbench: Using absolute paths for out & tuning directories As per issue #38, the dmidecode call to save the SMBIOS table in a binary file for later reading is broken as the file is not present into the hwbench-out directory. It appear that when dmidecode --dump-file is called, that is done with a relative path. The stderr log show the following error : hwbench-out-20241025123459/dmidecode.bin: open: No such file or directory It appear the relative path passed to dmidecode is not existing because dmidecode is not started in the same home dir as we are currently running the tool. This commit is about using an absolute path instead of a relative one to ensure that every command/tool can dump content in the proper directory. With this patch, the dmidecode-bin file is now present in the hwbench-out archive. 
Signed-off-by: Erwan Velu --- hwbench/hwbench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hwbench/hwbench.py b/hwbench/hwbench.py index 3f2fb9a..9124743 100755 --- a/hwbench/hwbench.py +++ b/hwbench/hwbench.py @@ -62,7 +62,7 @@ def create_output_directory() -> tuple[pathlib.Path, pathlib.Path]: tuning_out_dir = out_dir / "tuning" tuning_out_dir.mkdir() - return out_dir, tuning_out_dir + return out_dir.absolute(), tuning_out_dir.absolute() def parse_options(): From f8c8e15422550a3c5a52b4e0ca2fdbe192f137fc Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 15:36:59 +0200 Subject: [PATCH 09/10] hwgraph: Add option to manage missing data points. As reported in issue #40, when data points are missing, hwgraph was crashing because graphs must have the same number of data points. In such case, let's default to a fatal message to the user instead of crashing. This commit also adds a new hwgraph option, --ignore-missing-datapoint that can be used to define a behavior to ignore this case and graph. --ignore-missing-datapoint=zero will replace the missing datapoint by a 0 value. --ignore-missing-datapoint=last will replace the missing datapoint by the last known value of the series. It's up to the user to decide which one is the best but also understand that graphs will be partially wrong. By default, hwgraph generates a fatal error. Signed-off-by: Erwan Velu --- graph/graph.py | 25 ++++++++++++++++++++++++- graph/hwgraph.py | 7 +++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/graph/graph.py b/graph/graph.py index 54ea058..a35598b 100644 --- a/graph/graph.py +++ b/graph/graph.py @@ -262,7 +262,30 @@ def generic_graph( for component in components: if component.get_full_name() not in data_serie: data_serie[component.get_full_name()] = [] - data_serie[component.get_full_name()].append(component.get_mean()[sample]) + # If we are missing some datapoints .... 
+ if len(component.get_mean()) <= sample: + # If the user didn't explictely agreed to be replaced by 0, let's be fatal + if not args.ignore_missing_datapoint: + fatal( + f"{trace.get_name()}/{bench.get_bench_name()}: {component.get_full_name()} is missing the {sample+1}th data point.\ + Use --ignore-missing-datapoint to ignore this case. Generated graphs will be partially incorrect." + ) + else: + # User is fine with a missing data to be replaced. + # Let's do that so we can render things properly. + + # Let's pick the last known value + if args.ignore_missing_datapoint == "last": + data_serie[component.get_full_name()].append( + component.get_mean()[-1] + ) + else: + # Replace it by a zero + data_serie[component.get_full_name()].append(0) + else: + data_serie[component.get_full_name()].append( + component.get_mean()[sample] + ) if second_axis: for _, entry in bench.get_monitoring_metric(second_axis).items(): diff --git a/graph/hwgraph.py b/graph/hwgraph.py index 27d5334..4a5beb4 100755 --- a/graph/hwgraph.py +++ b/graph/hwgraph.py @@ -350,6 +350,13 @@ def main(): action="store_true", help="Enable verbose mode", ) + + parser_graph.add_argument( + "--ignore-missing-datapoint", + choices=["zero", "last"], + default="", + help="Replace a missing datapoint instead of stopping the rendering. Could be by a zero or the last known value.", + ) parser_graph.set_defaults(func=render_traces) parser_list = subparsers.add_parser( From eef217ae24802247ec3dd15bf4c146b71e2d98ef Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 16:49:51 +0200 Subject: [PATCH 10/10] graph: Adding CPU clock scaling graph As per issue #39, it could be super useful to have a new type of graph to represent how the CPU clock performed during a benchmark. This new graph gives a quick overview on how the CPU frequency behave during a scaling benchmark. 
The rendering is not as precise as environment graphs, but it gives a brief overview with the following trade-offs for the y-err bars: - min of the yerr-bar, is the min of min values - mean of the yerr-bar, is the mean of mean values - max of the yerr-bar, is the max of the max values Signed-off-by: Erwan Velu --- graph/graph.py | 2 +- graph/scaling.py | 25 +++++++++++++++++++++++++ graph/trace.py | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/graph/graph.py b/graph/graph.py index a35598b..cebfaac 100644 --- a/graph/graph.py +++ b/graph/graph.py @@ -17,7 +17,7 @@ def init_matplotlib(args): fatal(f"Cannot load matplotlib backend engine {args.engine}") -GRAPH_TYPES = ["perf", "perf_watt", "watts"] +GRAPH_TYPES = ["perf", "perf_watt", "watts", "cpu_clock"] class Graph: diff --git a/graph/scaling.py b/graph/scaling.py index 78af7f9..bb448b8 100644 --- a/graph/scaling.py +++ b/graph/scaling.py @@ -25,6 +25,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: aggregated_perfs_watt = {} # type: dict[str, dict[str, Any]] aggregated_watt = {} # type: dict[str, dict[str, Any]] aggregated_watt_err = {} # type: dict[str, dict[str, Any]] + aggregated_cpu_clock = {} # type: dict[str, dict[str, Any]] + aggregated_cpu_clock_err = {} # type: dict[str, dict[str, Any]] workers = {} # type: dict[str, list] logical_core_per_worker = [] perf_list, unit = benches[emp]["metrics"] @@ -41,6 +43,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: aggregated_perfs_watt[perf] = {} aggregated_watt[perf] = {} aggregated_watt_err[perf] = {} + aggregated_cpu_clock[perf] = {} + aggregated_cpu_clock_err[perf] = {} # For every trace file given at the command line for trace in args.traces: workers[trace.get_name()] = [] @@ -63,6 +67,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: aggregated_perfs_watt[perf][trace.get_name()] = [] 
aggregated_watt[perf][trace.get_name()] = [] aggregated_watt_err[perf][trace.get_name()] = [] + aggregated_cpu_clock[perf][trace.get_name()] = [] + aggregated_cpu_clock_err[perf][trace.get_name()] = [] bench.add_perf( perf, @@ -70,6 +76,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: perf_watt=aggregated_perfs_watt[perf][trace.get_name()], watt=aggregated_watt[perf][trace.get_name()], watt_err=aggregated_watt_err[perf][trace.get_name()], + cpu_clock=aggregated_cpu_clock[perf][trace.get_name()], + cpu_clock_err=aggregated_cpu_clock_err[perf][trace.get_name()], ) # Let's render all graphs types @@ -94,6 +102,13 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: outfile = f"scaling_watt_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}" y_label = "Watts" y_source = aggregated_watt + elif "cpu_clock" in graph_type: + graph_type_title = ( + f"Scaling {graph_type}: {args.traces[0].get_metric_name()}" + ) + outfile = f"scaling_cpu_clock_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}" + y_label = "Mhz" + y_source = aggregated_cpu_clock else: graph_type_title = ( f"Scaling {graph_type}: {bench.get_title_engine_name()}" @@ -164,6 +179,16 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: capsize=4, label=trace_name, ) + elif y_source == aggregated_cpu_clock: + graph.get_ax().errorbar( + x_serie, + y_serie, + yerr=np.array(aggregated_cpu_clock_err[perf][trace_name]).T, + ecolor=e_color, + color=color_name, + capsize=4, + label=trace_name, + ) else: graph.get_ax().plot( x_serie, diff --git a/graph/trace.py b/graph/trace.py index beed903..36fb7ae 100644 --- a/graph/trace.py +++ b/graph/trace.py @@ -232,6 +232,8 @@ def add_perf( perf_watt=None, watt=None, watt_err=None, + cpu_clock=None, + cpu_clock_err=None, index=None, ) -> None: """Extract performance and power efficiency""" @@ -316,6 +318,41 @@ def add_perf( watt_err.append(metric) else: watt_err[index] = metric + + if 
cpu_clock is not None: + mm = self.get_monitoring_metric(Metrics.FREQ) + mean_values = [] + min_values = [] + max_values = [] + + for freq_metric in mm: + if freq_metric != "CPU": + continue + # We have to compute metrics of all systems cores + for core in mm[freq_metric]: + # MIN of min ? + # Mean of mean ? + # Max of max ? + min_values.append(min(mm[freq_metric][core].get_min())) + mean_values.append(mean(mm[freq_metric][core].get_mean())) + max_values.append(max(mm[freq_metric][core].get_max())) + min_value = min(min_values) + mean_value = mean(mean_values) + max_value = max(max_values) + + if index is None: + cpu_clock.append(mean_value) + else: + cpu_clock[index] = mean_value + + # If we want to keep the error distribution to plot error bars + if cpu_clock_err is not None: + metric = (mean_value - min_value, max_value - mean_value) + if index is None: + cpu_clock_err.append(metric) + else: + cpu_clock_err[index] = metric + except ValueError: fatal(f"No {perf} found in {self.get_bench_name()}")