From 14c227026f2de776f15b712ad778ced9adce787d Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 10:02:25 +0200 Subject: [PATCH 01/10] dell: Do not discover OEM endpoint at every run get_oem_system() is trying to find the right endpoint as depending on systems they might vary. During the monitoring loop, if you are in the case where we need to use the /Attributes, the function generates two redfish calls. Once the right endpoint is found, let's reuse it directly at each call to save time and reduce the number of redfish calls we perform. Signed-off-by: Erwan Velu --- hwbench/environment/vendors/dell/dell.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/hwbench/environment/vendors/dell/dell.py b/hwbench/environment/vendors/dell/dell.py index c0aaa62..6930acd 100644 --- a/hwbench/environment/vendors/dell/dell.py +++ b/hwbench/environment/vendors/dell/dell.py @@ -11,6 +11,8 @@ class IDRAC(BMC): + oem_endpoint = "" + def get_thermal(self): return self.get_redfish_url("/redfish/v1/Chassis/System.Embedded.1/Thermal") @@ -42,17 +44,27 @@ def get_power(self): return self.get_redfish_url("/redfish/v1/Chassis/System.Embedded.1/Power") def get_oem_system(self): + # If we already found the proper endpoint, let's reuse it. 
+ if self.oem_endpoint: + return self.get_redfish_url( + self.oem_endpoint, + log_failure=False, + ) + + new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1" oem = self.get_redfish_url( - "/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1", + new_oem_endpoint, log_failure=False, ) # If not System.Embedded, let's use the default attributes if "Attributes" not in oem: - oem = self.get_redfish_url( - "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes" - ) + new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes" + oem = self.get_redfish_url(new_oem_endpoint) if "Attributes" not in oem: h.fatal("Cannot find Dell OEM metrics, please fill a bug.") + + # Let's save the endpoint to avoid trying all of them at every run + self.oem_endpoint = new_oem_endpoint return oem def read_power_consumption( From f51640182dc129dac45a20e29829cb06f15fa05e Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 14:04:13 +0200 Subject: [PATCH 02/10] hpe/ilorest: Prevent crash on single-node chassis A typical crash trace look like : Traceback (most recent call last): File "/usr/lib64/python3.9/runpy.py", line 197, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib64/python3.9/runpy.py", line 87, in _run_code exec(code, run_globals) File "/home/e.velu/hwbench/hwbench.py", line 117, in main() File "/home/e.velu/hwbench/hwbench.py", line 44, in main benches.parse_jobs_config() File "/home/e.velu/hwbench/bench/benchmarks.py", line 128, in parse_jobs_config self.__schedule_benchmarks( File "/home/e.velu/hwbench/bench/benchmarks.py", line 145, in __schedule_benchmarks self.__schedule_benchmark(job, pinned_cpu, emp, validate_parameters) File "/home/e.velu/hwbench/bench/benchmarks.py", line 160, in __schedule_benchmark self.hardware.vendor.get_bmc().connect_redfish() File "/home/e.velu/hwbench/environment/vendors/bmc.py", line 82, in connect_redfish return 
super().connect_redfish(bmc_username, bmc_password, self.get_url()) File "/home/e.velu/hwbench/environment/vendors/hpe/hpe.py", line 31, in get_url ipv4 = self.ilo.get_bmc_ipv4() File "/home/e.velu/hwbench/environment/vendors/hpe/ilorest.py", line 149, in get_bmc_ipv4 if "Manager Dedicated Network Interface" in nc.get("Name"): AttributeError: 'str' object has no attribute 'get' get_bmc_ipv4() make the following ilorest call : ilorest list --select ethernetinterface --filter "id=1" -j. On multi-node chassis like Apollo 2000 Gen10, the output is a list of interfaces. On single-node chassis like a DL380, the output is a dict. The code was considering the first case and crashed on the single-node servers. This patch ensures that if we get a single interface, we convert it into a single item list. This allow us having the same parsing code for both cases. Tested on DL380, ilorest 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.03' Tested on XL225n (Apollo 2000 Gen10), iloreset 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.08' Tested on DL340Gen12, ilorest 4.9.0.0, Model: 'iLO 6' FW: 'iLO 6 v1.62' Signed-off-by: Erwan Velu --- hwbench/environment/vendors/hpe/ilorest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hwbench/environment/vendors/hpe/ilorest.py b/hwbench/environment/vendors/hpe/ilorest.py index e458988..7e95a6e 100644 --- a/hwbench/environment/vendors/hpe/ilorest.py +++ b/hwbench/environment/vendors/hpe/ilorest.py @@ -140,6 +140,11 @@ def get_bmc_ipv4(self): select="ethernetinterface", filter="id=1", to_json=True ) if bmc_netconfig: + # On multi-node chassis, the ethernetinterface is a list + # On single-node chassis, the ethernetinterface is a dict + # Let's ensure we always have a list for get a single parsing. 
+ if isinstance(bmc_netconfig, dict): + bmc_netconfig = [bmc_netconfig] for nc in bmc_netconfig: if "Manager Dedicated Network Interface" in nc.get("Name"): ipv4 = nc.get("IPv4Addresses") From 5528f197706fc064715e3e32061ad567b0f390c4 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 15:05:48 +0200 Subject: [PATCH 03/10] hpe: Do not crash if not all PSUs are enabled When a PSU is missing the PowerSupplies endpoint looks like the following : {'@odata.id': '/redfish/v1/Chassis/1/Power/#PowerSupplies/0', 'FirmwareVersion': '0.23.49', 'LastPowerOutputWatts': 144, 'LineInputVoltage': 233, 'LineInputVoltageType': 'ACHighLine', 'Manufacturer': None, 'MemberId': '0', 'Model': 'P67240-B21', 'Name': 'HpeServerPowerSupply', 'Oem': {'Hpe': {'@odata.context': '/redfish/v1/$metadata#HpeServerPowerSupply.HpeServerPowerSupply', '@odata.type': '#HpeServerPowerSupply.v2_0_0.HpeServerPowerSupply', 'AveragePowerOutputWatts': 144, 'BayNumber': 1, 'Domain': 'System', 'HotplugCapable': True, 'MaxPowerOutputWatts': 143, 'Mismatched': False, 'PowerSupplyStatus': {'State': 'Ok'}, 'iPDUCapable': False}}, 'PowerCapacityWatts': 1000, 'PowerSupplyType': 'Unknown', 'SerialNumber': 'XXXXXXXXXXU', 'SparePartNumber': 'P68455-001', 'Status': {'Health': 'OK', 'State': 'Enabled'}} {'@odata.id': '/redfish/v1/Chassis/1/Power/#PowerSupplies/1', 'MemberId': '1', 'Oem': {'Hpe': {'@odata.context': '/redfish/v1/$metadata#HpeServerPowerSupply.HpeServerPowerSupply', '@odata.type': '#HpeServerPowerSupply.v2_0_0.HpeServerPowerSupply', 'BayNumber': 2}}, 'Status': {'Health': 'Warning', 'State': 'Absent'}} The missing 'Name' member, for PSU 1 here, crashed the application. This commit is about managing the PSU presence by checking the 'Status' member and its content. If no 'Status' and/or 'State' found, a custom error message will be reported to the user. A cache is used to avoid polluting the output. 
Tested on DL380, ilorest 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.03' Tested on XL225n (Apollo 2000 Gen10), iloreset 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.08' Tested on DL340Gen12, ilorest 4.9.0.0, Model: 'iLO 6' FW: 'iLO 6 v1.62' Reported-by: Elyes ZEKRI Signed-off-by: Erwan Velu --- hwbench/environment/vendors/hpe/hpe.py | 43 ++++++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/hwbench/environment/vendors/hpe/hpe.py b/hwbench/environment/vendors/hpe/hpe.py index 3a6577a..9293490 100644 --- a/hwbench/environment/vendors/hpe/hpe.py +++ b/hwbench/environment/vendors/hpe/hpe.py @@ -1,5 +1,7 @@ +import logging import pathlib import re +from functools import cache from typing import cast from ....bench.monitoring_structs import ( @@ -84,6 +86,10 @@ def add(self, name): def get_power(self): return self.get_redfish_url("/redfish/v1/Chassis/1/Power/") + @cache + def __warn_psu(self, psu_number, message): + logging.error(f"PSU {psu_number}: {message}") + def read_power_supplies( self, power_supplies: dict[str, dict[str, Power]] = {} ) -> dict[str, dict[str, Power]]: @@ -91,17 +97,34 @@ def read_power_supplies( if str(PowerContext.BMC) not in power_supplies: power_supplies[str(PowerContext.BMC)] = {} # type: ignore[no-redef] for psu in self.get_power().get("PowerSupplies"): - # Both PSUs are named the same (HpeServerPowerSupply) + psu_position = str(psu["Oem"]["Hpe"]["BayNumber"]) + # All PSUs are named the same (HpeServerPowerSupply) # Let's update it to have a unique name - name = psu["Name"] + str(psu["Oem"]["Hpe"]["BayNumber"]) - psu_name = "PS" + str(psu["Oem"]["Hpe"]["BayNumber"]) - super().add_monitoring_value( - cast(dict[str, dict[str, MonitorMetric]], power_supplies), - PowerContext.BMC, - Power(psu_name), - name, - psu["Oem"]["Hpe"]["AveragePowerOutputWatts"], - ) + psu_status = psu.get("Status") + if psu_status: + psu_state = psu_status.get("State") + if psu_state: + # We only consider healthy PSU + if str(psu_state).lower() == 
"enabled": + name = psu["Name"] + psu_position + psu_name = "PS" + psu_position + super().add_monitoring_value( + cast(dict[str, dict[str, MonitorMetric]], power_supplies), + PowerContext.BMC, + Power(psu_name), + name, + psu["Oem"]["Hpe"]["AveragePowerOutputWatts"], + ) + else: + # Let's inform the user the PSU is reported as non healthy + self.__warn_psu( + psu_position, + f'marked as {psu_state} in {psu_status.get("Health")} state', + ) + continue + + # Let's inform the user that no status was found, maybe a parsing or fw issue ? + self.__warn_psu(psu_position, "no status or state found !") return power_supplies From d79bb7fe00a6cf62b9cdb80ce87b9ce004336e5c Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 15:21:05 +0200 Subject: [PATCH 04/10] hpe: Remove redundant calls to enclosurechassis endpoint On mono-node HPE chassis, the current code is calling the enclosurechassis endpoint for each monitoring loop. This call is perfectly useless since the enclosurechassis does not exists. This patch is about adding a is_multinode_chassis() function to be cached that try this endpoint and get a constant answer. Regarding this cached status, a real read of the enclosurechassis is done for multinode chassis only. 
Tested on DL380, ilorest 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.03' Tested on XL225n (Apollo 2000 Gen10), iloreset 4.9.0.0, Model: 'iLO 5' FW: 'iLO 5 v3.08' Tested on DL340Gen12, ilorest 4.9.0.0, Model: 'iLO 6' FW: 'iLO 6 v1.62' Signed-off-by: Erwan Velu --- hwbench/environment/vendors/hpe/hpe.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/hwbench/environment/vendors/hpe/hpe.py b/hwbench/environment/vendors/hpe/hpe.py index 9293490..f6b939a 100644 --- a/hwbench/environment/vendors/hpe/hpe.py +++ b/hwbench/environment/vendors/hpe/hpe.py @@ -168,11 +168,23 @@ def read_power_consumption( ) return power_consumption - def get_oem_chassis(self): - return self.get_redfish_url( - "/redfish/v1/Chassis/enclosurechassis/", log_failure=False + @cache + def is_multinode_chassis(self) -> bool: + return ( + True + if self.get_redfish_url( + "/redfish/v1/Chassis/enclosurechassis/", log_failure=False + ) + else False ) + def get_oem_chassis(self): + if self.is_multinode_chassis(): + return self.get_redfish_url( + "/redfish/v1/Chassis/enclosurechassis/", log_failure=False + ) + return {} + class Hpe(Vendor): def __init__(self, out_dir, dmi, monitoring_config_filename): From 47bc82b7f38b360d503df5a21a0d48c0d7bb98eb Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Wed, 23 Oct 2024 18:53:02 +0200 Subject: [PATCH 05/10] monitoring: Increasing turbostat time budget By default, turbostat was given 200ms to start & complete. When CPUs do not have hyperthreading and all cores are loaded by a benchmark, like stress-ng, turbostat takes more than 200ms. As a result, the self.turbostat.parse() call in monitoring.py is waiting turbostat to complete which generates a timing overdue. 
This was generating traces like : hwbench: 1 jobs, 1 benchmarks, ETA 0h 01m 00s [full_cpu_load] stressng/cpu/matrixprod(M): 128 stressor on CPU [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127] for 60s Monitoring iteration 8 is 0.12ms late Monitoring iteration 9 is 0.47ms late Monitoring iteration 10 is 0.78ms late Monitoring iteration 11 is 1.19ms late Monitoring iteration 12 is 1.28ms late Monitoring iteration 13 is 1.59ms late Monitoring iteration 14 is 1.90ms late Monitoring iteration 15 is 2.22ms late Monitoring iteration 16 is 2.53ms late Monitoring iteration 17 is 2.85ms late Monitoring iteration 18 is 3.54ms late Monitoring iteration 19 is 3.48ms late Monitoring iteration 20 is 3.82ms late Monitoring iteration 21 is 4.15ms late Monitoring iteration 22 is 4.42ms late Monitoring iteration 23 is 4.71ms late Monitoring iteration 24 is 5.02ms late Monitoring iteration 25 is 5.57ms late Monitoring iteration 26 is 5.66ms late Monitoring iteration 27 is 5.89ms late Monitoring iteration 28 is 5.77ms late This patch is lamely increasing the time budget to 500ms. On the affeacted machines, it solved the issue. Maybe at some point we'll have to increase the 'precision' window to get a better ratio time budget vs run time. Tested on Intel(R) Xeon(R) 6756E systems. 
Signed-off-by: Erwan Velu --- hwbench/bench/monitoring.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hwbench/bench/monitoring.py b/hwbench/bench/monitoring.py index 9507180..c43376b 100644 --- a/hwbench/bench/monitoring.py +++ b/hwbench/bench/monitoring.py @@ -192,8 +192,8 @@ def next_iter(): start_time = self.get_monotonic_clock() if self.turbostat: # Turbostat will run for the whole duration of this loop - # We just retract a 2/10th of second to ensure it will not overdue - self.turbostat.run(interval=(precision - 0.2)) + # We just retract a 5/10th of second to ensure it will not overdue + self.turbostat.run(interval=(precision - 0.5)) # Let's monitor the time spent at monitoring the CPU self.get_metric(Metrics.MONITOR)["CPU"]["Polling"].add( (self.get_monotonic_clock() - start_time) * 1e-6 From 503e5bae32200f7a7226945d1feaa62d62a2428e Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 11:40:44 +0200 Subject: [PATCH 06/10] hwgraph: Do not crash with empty monitoring metrics If a monitoring metric is empty, do not crash hwgraph but rather display a warning. This was seen in issue #35. The following metric was causing the issue : "PDU": { "Polling": { "name": "Polling", "unit": "ms", "mean": [], "min": [], "max": [], "stdev": [], "samples": [] } }, With this commit a typical output looks like the following: environment: rendering 131 jobs from scaleway/hwbench-out-20241023125613/results.json (DL320) avx_1: No samples found in PDU.Polling, ignoring metric. avx_10: No samples found in PDU.Polling, ignoring metric. avx_11: No samples found in PDU.Polling, ignoring metric. avx_12: No samples found in PDU.Polling, ignoring metric. avx_13: No samples found in PDU.Polling, ignoring metric. 
Reported-by: Elyes ZEKRI Signed-off-by: Erwan Velu --- graph/hwgraph.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/graph/hwgraph.py b/graph/hwgraph.py index 798a8d3..27d5334 100755 --- a/graph/hwgraph.py +++ b/graph/hwgraph.py @@ -94,14 +94,20 @@ def graph_monitoring_metrics(args, trace: Trace, bench_name: str, output_dir) -> metrics = bench.get_component(Metrics.MONITOR, metric_name) if metrics: for metric in metrics: - rendered_graphs += yerr_graph( - args, - output_dir, - bench, - Metrics.MONITOR, - metrics[metric], - prefix=f"{metric_name}.", - ) + # If a metric has no measure, let's ignore it + if len(metrics[metric].get_samples()) == 0: + print( + f"{bench_name}: No samples found in {metric_name}.{metric}, ignoring metric." + ) + else: + rendered_graphs += yerr_graph( + args, + output_dir, + bench, + Metrics.MONITOR, + metrics[metric], + prefix=f"{metric_name}.", + ) return rendered_graphs From c7669f9cb362a13259131a4ff0a10c0fa4bc6fde Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 12:44:48 +0200 Subject: [PATCH 07/10] hwgraph: Adding metric get_full_name() When loading metrics, it appear that some can have the same name making invalid data, data name and misleading graphs. This was seen in issue #36. The BMC reported the two following structures : "CPU": { "02-CPU 1 PkgTmp": { "name": "CPU1", "unit": "Celsius", "mean": [ [...] "12-CPU 1": { "name": "CPU1", "unit": "Celsius", "mean": [ As we use the name member, we got the two different metrics being merged as a single one. This patch is adding a workaround by consider the parent's name as the "full_name" of the metric. So this patch is adding this value, like "12-CPU 1", as the full name and reachable via get_full_name(). This commit ensures the env graphs are using this full name to avoid smashing metrics. 
Signed-off-by: Erwan Velu --- graph/graph.py | 10 +++++----- graph/trace.py | 2 +- hwbench/bench/monitoring_structs.py | 9 ++++++++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/graph/graph.py b/graph/graph.py index e1ce3c0..54ea058 100644 --- a/graph/graph.py +++ b/graph/graph.py @@ -260,9 +260,9 @@ def generic_graph( time_serie.append(time) # Collect all components mean value for component in components: - if component.get_name() not in data_serie: - data_serie[component.get_name()] = [] - data_serie[component.get_name()].append(component.get_mean()[sample]) + if component.get_full_name() not in data_serie: + data_serie[component.get_full_name()] = [] + data_serie[component.get_full_name()].append(component.get_mean()[sample]) if second_axis: for _, entry in bench.get_monitoring_metric(second_axis).items(): @@ -291,8 +291,8 @@ def generic_graph( graph.get_ax2().plot(x_serie, y2_serie, "", label=data2_item, marker="o") for component in components: - y_serie = np.array(data_serie[component.get_name()])[order] - graph.get_ax().plot(x_serie, y_serie, "", label=component.get_name()) + y_serie = np.array(data_serie[component.get_full_name()])[order] + graph.get_ax().plot(x_serie, y_serie, "", label=component.get_full_name()) graph.prepare_axes( 30, diff --git a/graph/trace.py b/graph/trace.py index 7e8bbca..beed903 100644 --- a/graph/trace.py +++ b/graph/trace.py @@ -82,7 +82,7 @@ def load_monitoring(self): mm = Temperature(measure, original_measure["unit"]) else: mm = MonitorMetric(measure, original_measure["unit"]) - mm.load_from_dict(original_measure) + mm.load_from_dict(original_measure, measure) self.metrics[metric][component_family][measure] = mm else: fatal(f"Unexpected {metric} in monitoring") diff --git a/hwbench/bench/monitoring_structs.py b/hwbench/bench/monitoring_structs.py index 9997dab..b43ce24 100644 --- a/hwbench/bench/monitoring_structs.py +++ b/hwbench/bench/monitoring_structs.py @@ -26,7 +26,8 @@ def __post_init__(self): # If a 
value is given, let's add it self.add(self.value) - def load_from_dict(self, input: dict): + def load_from_dict(self, input: dict, full_name: str): + self.full_name = full_name self.name = str(input.get("name")) self.unit = str(input.get("unit")) self.mean = input.get("mean") # type: ignore[assignment] @@ -35,6 +36,12 @@ def load_from_dict(self, input: dict): self.stdev = input.get("stdev") # type: ignore[assignment] self.samples = input.get("samples") # type: ignore[assignment] + def get_full_name(self): + """Return the metric full name""" + if self.full_name: + return self.full_name + return self.name + def get_name(self): """Return the metric name""" return self.name From 30848178ca583e41d0531202113b8fd37f650226 Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 14:46:19 +0200 Subject: [PATCH 08/10] hwbench: Using absolute paths for out & tuning directories As per issue #38, the dmidecode call to save the SMBIOS table in a binary file for later reading is broken as the file is not present into the hwbench-out directory. It appear that when dmidecode --dump-file is called, that is done with a relative path. The stderr log show the following error : hwbench-out-20241025123459/dmidecode.bin: open: No such file or directory It appear the relative path passed to dmidecode is not existing because dmidecode is not started in the same home dir as we are currently running the tool. This commit is about using an absolute path instead of a relative one to ensure that every command/tool can dump content in the proper directory. With this patch, the dmidecode-bin file is now present in the hwbench-out archive. 
Signed-off-by: Erwan Velu --- hwbench/hwbench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hwbench/hwbench.py b/hwbench/hwbench.py index 3f2fb9a..9124743 100755 --- a/hwbench/hwbench.py +++ b/hwbench/hwbench.py @@ -62,7 +62,7 @@ def create_output_directory() -> tuple[pathlib.Path, pathlib.Path]: tuning_out_dir = out_dir / "tuning" tuning_out_dir.mkdir() - return out_dir, tuning_out_dir + return out_dir.absolute(), tuning_out_dir.absolute() def parse_options(): From f8c8e15422550a3c5a52b4e0ca2fdbe192f137fc Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 15:36:59 +0200 Subject: [PATCH 09/10] hwgraph: Add option to manage missing data points. As reported in issue #40, when data points are missing, hwgraph was crashing because graphs must have the same number of data points. In such case, let's default to a fatal message to the user instead of crashing. This commit also adds a new hwgraph option, --ignore-missing-datapoint that can be used to define a behavior to ignore this case and graph. --ignore-missing-datapoint=zero will replace the missing datapoint by a 0 value. --ignore-missing-datapoint=last will replace the missing datapoint by the last known value of the series. It's up to the user to decide which one is the best but also understand that graphs will be partially wrong. By default, hwgraph generates a fatal error. Signed-off-by: Erwan Velu --- graph/graph.py | 25 ++++++++++++++++++++++++- graph/hwgraph.py | 7 +++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/graph/graph.py b/graph/graph.py index 54ea058..a35598b 100644 --- a/graph/graph.py +++ b/graph/graph.py @@ -262,7 +262,30 @@ def generic_graph( for component in components: if component.get_full_name() not in data_serie: data_serie[component.get_full_name()] = [] - data_serie[component.get_full_name()].append(component.get_mean()[sample]) + # If we are missing some datapoints .... 
+ if len(component.get_mean()) <= sample: + # If the user didn't explictely agreed to be replaced by 0, let's be fatal + if not args.ignore_missing_datapoint: + fatal( + f"{trace.get_name()}/{bench.get_bench_name()}: {component.get_full_name()} is missing the {sample+1}th data point.\ + Use --ignore-missing-datapoint to ignore this case. Generated graphs will be partially incorrect." + ) + else: + # User is fine with a missing data to be replaced. + # Let's do that so we can render things properly. + + # Let's pick the last known value + if args.ignore_missing_datapoint == "last": + data_serie[component.get_full_name()].append( + component.get_mean()[-1] + ) + else: + # Replace it by a zero + data_serie[component.get_full_name()].append(0) + else: + data_serie[component.get_full_name()].append( + component.get_mean()[sample] + ) if second_axis: for _, entry in bench.get_monitoring_metric(second_axis).items(): diff --git a/graph/hwgraph.py b/graph/hwgraph.py index 27d5334..4a5beb4 100755 --- a/graph/hwgraph.py +++ b/graph/hwgraph.py @@ -350,6 +350,13 @@ def main(): action="store_true", help="Enable verbose mode", ) + + parser_graph.add_argument( + "--ignore-missing-datapoint", + choices=["zero", "last"], + default="", + help="Replace a missing datapoint instead of stopping the rendering. Could be by a zero or the last known value.", + ) parser_graph.set_defaults(func=render_traces) parser_list = subparsers.add_parser( From eef217ae24802247ec3dd15bf4c146b71e2d98ef Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 25 Oct 2024 16:49:51 +0200 Subject: [PATCH 10/10] graph: Adding CPU clock scaling graph As per issue #39, it could be super useful to have a new type of graph to represent how the CPU clock performed during a benchmark. This new graph gives a quick overview on how the CPU frequency behave during a scaling benchmark. 
The rendering is not as precise as environment graphs, but it gives a brief overview with the following trade-offs for the y-err bars: - min of the yerr-bar, is the min of min values - mean of the yerr-bar, is the mean of mean values - max of the yerr-bar, is the max of the max values Signed-off-by: Erwan Velu --- graph/graph.py | 2 +- graph/scaling.py | 25 +++++++++++++++++++++++++ graph/trace.py | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/graph/graph.py b/graph/graph.py index a35598b..cebfaac 100644 --- a/graph/graph.py +++ b/graph/graph.py @@ -17,7 +17,7 @@ def init_matplotlib(args): fatal(f"Cannot load matplotlib backend engine {args.engine}") -GRAPH_TYPES = ["perf", "perf_watt", "watts"] +GRAPH_TYPES = ["perf", "perf_watt", "watts", "cpu_clock"] class Graph: diff --git a/graph/scaling.py b/graph/scaling.py index 78af7f9..bb448b8 100644 --- a/graph/scaling.py +++ b/graph/scaling.py @@ -25,6 +25,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: aggregated_perfs_watt = {} # type: dict[str, dict[str, Any]] aggregated_watt = {} # type: dict[str, dict[str, Any]] aggregated_watt_err = {} # type: dict[str, dict[str, Any]] + aggregated_cpu_clock = {} # type: dict[str, dict[str, Any]] + aggregated_cpu_clock_err = {} # type: dict[str, dict[str, Any]] workers = {} # type: dict[str, list] logical_core_per_worker = [] perf_list, unit = benches[emp]["metrics"] @@ -41,6 +43,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: aggregated_perfs_watt[perf] = {} aggregated_watt[perf] = {} aggregated_watt_err[perf] = {} + aggregated_cpu_clock[perf] = {} + aggregated_cpu_clock_err[perf] = {} # For every trace file given at the command line for trace in args.traces: workers[trace.get_name()] = [] @@ -63,6 +67,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: aggregated_perfs_watt[perf][trace.get_name()] = [] 
aggregated_watt[perf][trace.get_name()] = [] aggregated_watt_err[perf][trace.get_name()] = [] + aggregated_cpu_clock[perf][trace.get_name()] = [] + aggregated_cpu_clock_err[perf][trace.get_name()] = [] bench.add_perf( perf, @@ -70,6 +76,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: perf_watt=aggregated_perfs_watt[perf][trace.get_name()], watt=aggregated_watt[perf][trace.get_name()], watt_err=aggregated_watt_err[perf][trace.get_name()], + cpu_clock=aggregated_cpu_clock[perf][trace.get_name()], + cpu_clock_err=aggregated_cpu_clock_err[perf][trace.get_name()], ) # Let's render all graphs types @@ -94,6 +102,13 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: outfile = f"scaling_watt_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}" y_label = "Watts" y_source = aggregated_watt + elif "cpu_clock" in graph_type: + graph_type_title = ( + f"Scaling {graph_type}: {args.traces[0].get_metric_name()}" + ) + outfile = f"scaling_cpu_clock_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}" + y_label = "Mhz" + y_source = aggregated_cpu_clock else: graph_type_title = ( f"Scaling {graph_type}: {bench.get_title_engine_name()}" @@ -164,6 +179,16 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int: capsize=4, label=trace_name, ) + elif y_source == aggregated_cpu_clock: + graph.get_ax().errorbar( + x_serie, + y_serie, + yerr=np.array(aggregated_cpu_clock_err[perf][trace_name]).T, + ecolor=e_color, + color=color_name, + capsize=4, + label=trace_name, + ) else: graph.get_ax().plot( x_serie, diff --git a/graph/trace.py b/graph/trace.py index beed903..36fb7ae 100644 --- a/graph/trace.py +++ b/graph/trace.py @@ -232,6 +232,8 @@ def add_perf( perf_watt=None, watt=None, watt_err=None, + cpu_clock=None, + cpu_clock_err=None, index=None, ) -> None: """Extract performance and power efficiency""" @@ -316,6 +318,41 @@ def add_perf( watt_err.append(metric) else: watt_err[index] = metric + + if 
cpu_clock is not None: + mm = self.get_monitoring_metric(Metrics.FREQ) + mean_values = [] + min_values = [] + max_values = [] + + for freq_metric in mm: + if freq_metric != "CPU": + continue + # We have to compute metrics of all systems cores + for core in mm[freq_metric]: + # MIN of min ? + # Mean of mean ? + # Max of max ? + min_values.append(min(mm[freq_metric][core].get_min())) + mean_values.append(mean(mm[freq_metric][core].get_mean())) + max_values.append(max(mm[freq_metric][core].get_max())) + min_value = min(min_values) + mean_value = mean(mean_values) + max_value = max(max_values) + + if index is None: + cpu_clock.append(mean_value) + else: + cpu_clock[index] = mean_value + + # If we want to keep the error distribution to plot error bars + if cpu_clock_err is not None: + metric = (mean_value - min_value, max_value - mean_value) + if index is None: + cpu_clock_err.append(metric) + else: + cpu_clock_err[index] = metric + except ValueError: fatal(f"No {perf} found in {self.get_bench_name()}")