Skip to content

Commit

Permalink
Merge pull request #34 from criteo/various
Browse files Browse the repository at this point in the history
A collection of fixes to improve performance and stability
  • Loading branch information
ErwanAliasr1 authored Oct 28, 2024
2 parents f2fa9c2 + eef217a commit a930da3
Show file tree
Hide file tree
Showing 10 changed files with 193 additions and 36 deletions.
35 changes: 29 additions & 6 deletions graph/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def init_matplotlib(args):
fatal(f"Cannot load matplotlib backend engine {args.engine}")


GRAPH_TYPES = ["perf", "perf_watt", "watts"]
GRAPH_TYPES = ["perf", "perf_watt", "watts", "cpu_clock"]


class Graph:
Expand Down Expand Up @@ -260,9 +260,32 @@ def generic_graph(
time_serie.append(time)
# Collect all components mean value
for component in components:
if component.get_name() not in data_serie:
data_serie[component.get_name()] = []
data_serie[component.get_name()].append(component.get_mean()[sample])
if component.get_full_name() not in data_serie:
data_serie[component.get_full_name()] = []
# If we are missing some datapoints ....
if len(component.get_mean()) <= sample:
# If the user didn't explicitly agree to replace it with 0, let's be fatal
if not args.ignore_missing_datapoint:
fatal(
f"{trace.get_name()}/{bench.get_bench_name()}: {component.get_full_name()} is missing the {sample+1}th data point.\
Use --ignore-missing-datapoint to ignore this case. Generated graphs will be partially incorrect."
)
else:
# User is fine with a missing data to be replaced.
# Let's do that so we can render things properly.

# Let's pick the last known value
if args.ignore_missing_datapoint == "last":
data_serie[component.get_full_name()].append(
component.get_mean()[-1]
)
else:
# Replace it by a zero
data_serie[component.get_full_name()].append(0)
else:
data_serie[component.get_full_name()].append(
component.get_mean()[sample]
)

if second_axis:
for _, entry in bench.get_monitoring_metric(second_axis).items():
Expand Down Expand Up @@ -291,8 +314,8 @@ def generic_graph(
graph.get_ax2().plot(x_serie, y2_serie, "", label=data2_item, marker="o")

for component in components:
y_serie = np.array(data_serie[component.get_name()])[order]
graph.get_ax().plot(x_serie, y_serie, "", label=component.get_name())
y_serie = np.array(data_serie[component.get_full_name()])[order]
graph.get_ax().plot(x_serie, y_serie, "", label=component.get_full_name())

graph.prepare_axes(
30,
Expand Down
29 changes: 21 additions & 8 deletions graph/hwgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,20 @@ def graph_monitoring_metrics(args, trace: Trace, bench_name: str, output_dir) ->
metrics = bench.get_component(Metrics.MONITOR, metric_name)
if metrics:
for metric in metrics:
rendered_graphs += yerr_graph(
args,
output_dir,
bench,
Metrics.MONITOR,
metrics[metric],
prefix=f"{metric_name}.",
)
# If a metric has no measure, let's ignore it
if len(metrics[metric].get_samples()) == 0:
print(
f"{bench_name}: No samples found in {metric_name}.{metric}, ignoring metric."
)
else:
rendered_graphs += yerr_graph(
args,
output_dir,
bench,
Metrics.MONITOR,
metrics[metric],
prefix=f"{metric_name}.",
)

return rendered_graphs

Expand Down Expand Up @@ -344,6 +350,13 @@ def main():
action="store_true",
help="Enable verbose mode",
)

parser_graph.add_argument(
"--ignore-missing-datapoint",
choices=["zero", "last"],
default="",
help="Replace a missing datapoint instead of stopping the rendering. Could be by a zero or the last known value.",
)
parser_graph.set_defaults(func=render_traces)

parser_list = subparsers.add_parser(
Expand Down
25 changes: 25 additions & 0 deletions graph/scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt = {} # type: dict[str, dict[str, Any]]
aggregated_watt = {} # type: dict[str, dict[str, Any]]
aggregated_watt_err = {} # type: dict[str, dict[str, Any]]
aggregated_cpu_clock = {} # type: dict[str, dict[str, Any]]
aggregated_cpu_clock_err = {} # type: dict[str, dict[str, Any]]
workers = {} # type: dict[str, list]
logical_core_per_worker = []
perf_list, unit = benches[emp]["metrics"]
Expand All @@ -41,6 +43,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt[perf] = {}
aggregated_watt[perf] = {}
aggregated_watt_err[perf] = {}
aggregated_cpu_clock[perf] = {}
aggregated_cpu_clock_err[perf] = {}
# For every trace file given at the command line
for trace in args.traces:
workers[trace.get_name()] = []
Expand All @@ -63,13 +67,17 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt[perf][trace.get_name()] = []
aggregated_watt[perf][trace.get_name()] = []
aggregated_watt_err[perf][trace.get_name()] = []
aggregated_cpu_clock[perf][trace.get_name()] = []
aggregated_cpu_clock_err[perf][trace.get_name()] = []

bench.add_perf(
perf,
traces_perf=aggregated_perfs[perf][trace.get_name()],
perf_watt=aggregated_perfs_watt[perf][trace.get_name()],
watt=aggregated_watt[perf][trace.get_name()],
watt_err=aggregated_watt_err[perf][trace.get_name()],
cpu_clock=aggregated_cpu_clock[perf][trace.get_name()],
cpu_clock_err=aggregated_cpu_clock_err[perf][trace.get_name()],
)

# Let's render all graphs types
Expand All @@ -94,6 +102,13 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
outfile = f"scaling_watt_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
y_label = "Watts"
y_source = aggregated_watt
elif "cpu_clock" in graph_type:
graph_type_title = (
f"Scaling {graph_type}: {args.traces[0].get_metric_name()}"
)
outfile = f"scaling_cpu_clock_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
y_label = "Mhz"
y_source = aggregated_cpu_clock
else:
graph_type_title = (
f"Scaling {graph_type}: {bench.get_title_engine_name()}"
Expand Down Expand Up @@ -164,6 +179,16 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
capsize=4,
label=trace_name,
)
elif y_source == aggregated_cpu_clock:
graph.get_ax().errorbar(
x_serie,
y_serie,
yerr=np.array(aggregated_cpu_clock_err[perf][trace_name]).T,
ecolor=e_color,
color=color_name,
capsize=4,
label=trace_name,
)
else:
graph.get_ax().plot(
x_serie,
Expand Down
39 changes: 38 additions & 1 deletion graph/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def load_monitoring(self):
mm = Temperature(measure, original_measure["unit"])
else:
mm = MonitorMetric(measure, original_measure["unit"])
mm.load_from_dict(original_measure)
mm.load_from_dict(original_measure, measure)
self.metrics[metric][component_family][measure] = mm
else:
fatal(f"Unexpected {metric} in monitoring")
Expand Down Expand Up @@ -232,6 +232,8 @@ def add_perf(
perf_watt=None,
watt=None,
watt_err=None,
cpu_clock=None,
cpu_clock_err=None,
index=None,
) -> None:
"""Extract performance and power efficiency"""
Expand Down Expand Up @@ -316,6 +318,41 @@ def add_perf(
watt_err.append(metric)
else:
watt_err[index] = metric

if cpu_clock is not None:
mm = self.get_monitoring_metric(Metrics.FREQ)
mean_values = []
min_values = []
max_values = []

for freq_metric in mm:
if freq_metric != "CPU":
continue
# We have to compute metrics of all systems cores
for core in mm[freq_metric]:
# MIN of min ?
# Mean of mean ?
# Max of max ?
min_values.append(min(mm[freq_metric][core].get_min()))
mean_values.append(mean(mm[freq_metric][core].get_mean()))
max_values.append(max(mm[freq_metric][core].get_max()))
min_value = min(min_values)
mean_value = mean(mean_values)
max_value = max(max_values)

if index is None:
cpu_clock.append(mean_value)
else:
cpu_clock[index] = mean_value

# If we want to keep the error distribution to plot error bars
if cpu_clock_err is not None:
metric = (mean_value - min_value, max_value - mean_value)
if index is None:
cpu_clock_err.append(metric)
else:
cpu_clock_err[index] = metric

except ValueError:
fatal(f"No {perf} found in {self.get_bench_name()}")

Expand Down
4 changes: 2 additions & 2 deletions hwbench/bench/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ def next_iter():
start_time = self.get_monotonic_clock()
if self.turbostat:
# Turbostat will run for the whole duration of this loop
# We just retract a 2/10th of second to ensure it will not overdue
self.turbostat.run(interval=(precision - 0.2))
# We just retract a 5/10th of second to ensure it will not overdue
self.turbostat.run(interval=(precision - 0.5))
# Let's monitor the time spent at monitoring the CPU
self.get_metric(Metrics.MONITOR)["CPU"]["Polling"].add(
(self.get_monotonic_clock() - start_time) * 1e-6
Expand Down
9 changes: 8 additions & 1 deletion hwbench/bench/monitoring_structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def __post_init__(self):
# If a value is given, let's add it
self.add(self.value)

def load_from_dict(self, input: dict):
def load_from_dict(self, input: dict, full_name: str):
self.full_name = full_name
self.name = str(input.get("name"))
self.unit = str(input.get("unit"))
self.mean = input.get("mean") # type: ignore[assignment]
Expand All @@ -35,6 +36,12 @@ def load_from_dict(self, input: dict):
self.stdev = input.get("stdev") # type: ignore[assignment]
self.samples = input.get("samples") # type: ignore[assignment]

def get_full_name(self):
    """Return the metric full name, falling back to the short name when unset."""
    # `or` selects full_name whenever it is truthy, mirroring the if/else form.
    return self.full_name or self.name

def get_name(self):
    """Return the metric name"""
    metric_name = self.name
    return metric_name
Expand Down
20 changes: 16 additions & 4 deletions hwbench/environment/vendors/dell/dell.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@


class IDRAC(BMC):
oem_endpoint = ""

def get_thermal(self):
    """Fetch the chassis Thermal resource from the iDRAC Redfish API."""
    thermal_endpoint = "/redfish/v1/Chassis/System.Embedded.1/Thermal"
    return self.get_redfish_url(thermal_endpoint)

Expand Down Expand Up @@ -42,17 +44,27 @@ def get_power(self):
return self.get_redfish_url("/redfish/v1/Chassis/System.Embedded.1/Power")

def get_oem_system(self):
    """Return the Dell OEM system attributes from the iDRAC Redfish API.

    Tries the DellAttributes endpoint first, then falls back to the
    generic Attributes endpoint. The working endpoint is cached in
    self.oem_endpoint so subsequent calls probe only once.
    Fatal if neither endpoint exposes an "Attributes" payload.
    """
    # If we already found the proper endpoint, let's reuse it.
    if self.oem_endpoint:
        return self.get_redfish_url(
            self.oem_endpoint,
            log_failure=False,
        )

    new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1"
    oem = self.get_redfish_url(
        new_oem_endpoint,
        log_failure=False,
    )
    # If not System.Embedded, let's use the default attributes
    if "Attributes" not in oem:
        new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes"
        oem = self.get_redfish_url(new_oem_endpoint)
        if "Attributes" not in oem:
            h.fatal("Cannot find Dell OEM metrics, please fill a bug.")

    # Let's save the endpoint to avoid trying all of them at every run
    self.oem_endpoint = new_oem_endpoint
    return oem

def read_power_consumption(
Expand Down
61 changes: 48 additions & 13 deletions hwbench/environment/vendors/hpe/hpe.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging
import pathlib
import re
from functools import cache
from typing import cast

from ....bench.monitoring_structs import (
Expand Down Expand Up @@ -84,24 +86,45 @@ def add(self, name):
def get_power(self):
    """Fetch the chassis Power resource from the iLO Redfish API."""
    power_endpoint = "/redfish/v1/Chassis/1/Power/"
    return self.get_redfish_url(power_endpoint)

@cache
def __warn_psu(self, psu_number, message):
    # Log a PSU health warning. @cache deduplicates: each distinct
    # (self, psu_number, message) triple is logged only once, so repeated
    # polling loops don't flood the log with the same warning.
    # NOTE(review): caching a bound method keeps `self` alive for the
    # process lifetime (ruff B019) — acceptable if the BMC object is
    # long-lived; confirm.
    logging.error(f"PSU {psu_number}: {message}")

def read_power_supplies(
    self, power_supplies: dict[str, dict[str, Power]] = {}
) -> dict[str, dict[str, Power]]:
    """Return power supplies power from server.

    Only PSUs reported as healthy (State == "enabled") are recorded;
    unhealthy or status-less PSUs are logged via __warn_psu and skipped.
    NOTE(review): the mutable default argument is kept as-is — callers
    appear to pass/accumulate an explicit dict; confirm before changing.
    """
    if str(PowerContext.BMC) not in power_supplies:
        power_supplies[str(PowerContext.BMC)] = {}  # type: ignore[no-redef]
    for psu in self.get_power().get("PowerSupplies"):
        psu_position = str(psu["Oem"]["Hpe"]["BayNumber"])
        psu_status = psu.get("Status")
        if psu_status:
            psu_state = psu_status.get("State")
            if psu_state:
                # We only consider healthy PSU
                if str(psu_state).lower() == "enabled":
                    # All PSUs are named the same (HpeServerPowerSupply)
                    # Let's update it to have a unique name
                    name = psu["Name"] + psu_position
                    psu_name = "PS" + psu_position
                    super().add_monitoring_value(
                        cast(dict[str, dict[str, MonitorMetric]], power_supplies),
                        PowerContext.BMC,
                        Power(psu_name),
                        name,
                        psu["Oem"]["Hpe"]["AveragePowerOutputWatts"],
                    )
                else:
                    # Let's inform the user the PSU is reported as non healthy
                    self.__warn_psu(
                        psu_position,
                        f'marked as {psu_state} in {psu_status.get("Health")} state',
                    )
                continue

        # Let's inform the user that no status was found, maybe a parsing or fw issue ?
        self.__warn_psu(psu_position, "no status or state found !")

    return power_supplies

Expand Down Expand Up @@ -145,11 +168,23 @@ def read_power_consumption(
)
return power_consumption

def get_oem_chassis(self):
return self.get_redfish_url(
"/redfish/v1/Chassis/enclosurechassis/", log_failure=False
@cache
def is_multinode_chassis(self) -> bool:
    """Whether this server is part of a multi-node enclosure chassis.

    Probes the Redfish enclosure endpoint; a non-empty answer means the
    enclosure exists. The result is cached so the endpoint is queried at
    most once per instance.
    NOTE(review): @cache on a method keeps `self` alive for the process
    lifetime (ruff B019) — acceptable if the BMC object is long-lived.
    """
    # bool() replaces the redundant `True if x else False` form.
    return bool(
        self.get_redfish_url(
            "/redfish/v1/Chassis/enclosurechassis/", log_failure=False
        )
    )

def get_oem_chassis(self):
    """Return the enclosure chassis resource, or {} on single-node servers."""
    # Guard clause: nothing to fetch unless we sit in a multi-node enclosure.
    if not self.is_multinode_chassis():
        return {}
    return self.get_redfish_url(
        "/redfish/v1/Chassis/enclosurechassis/", log_failure=False
    )


class Hpe(Vendor):
def __init__(self, out_dir, dmi, monitoring_config_filename):
Expand Down
Loading

0 comments on commit a930da3

Please sign in to comment.