Merge pull request #20 from criteo/monito

Improving the monitoring portability
criteo · May 31, 2024 · d2bc303 · d2bc303
2 parents 9f9333c + da3e8ad
commit d2bc303
Show file tree

Hide file tree

Showing 12 changed files with 132 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -65,7 +65,7 @@ Running the **simple.conf** job:
 ## Mandatory
 - python >= 3.9
 - [python dependencies](./requirements/base.in)
-- turbostat >= 2022.07.28
+- turbostat >= 2022.04.16
 - numactl
 - dmidecode
 - util-linux >= 2.32
@@ -75,4 +75,4 @@ Running the **simple.conf** job:
 ## Optional
 - ipmitool
 - ilorest (for HPE servers)
-- stress-ng >= 0.17.04
+- stress-ng >= 0.17.04
diff --git a/hwbench/bench/engine.py b/hwbench/bench/engine.py
@@ -3,6 +3,7 @@
 from typing import Optional
 
 from ..utils.external import External
+from ..utils.helpers import fatal
 from .parameters import BenchmarkParameters
 
 
@@ -47,6 +48,12 @@ def __init__(
         self.engine_name = name
         self.binary = binary
         self.modules = modules
+        # FIXME: If the import is done at the file level, the mocking is lost here
+        # So I'm importing is_binary_available just before the call :/
+        from ..utils.helpers import is_binary_available
+
+        if not is_binary_available(self.binary):
+            fatal(f"Engine {name} requires '{binary}' binary, please install it.")
 
     def get_binary(self) -> str:
         return self.binary

diff --git a/hwbench/bench/monitoring.py b/hwbench/bench/monitoring.py
@@ -57,52 +57,52 @@ def __set_metric(self, metric: Metrics, value: dict[str, dict[str, MonitorMetric
 
     def prepare(self):
         """Preparing the monitoring"""
-        # Let's be sure the monitoring is functional by
-        # - checking the BMC is actually connected to the network
-        if self.vendor.get_bmc().get_ip() == "0.0.0.0":
-            h.fatal("BMC has no IP, monitoring will not be possible")
-        print(
-            f"Starting monitoring for {self.vendor.name()} vendor with {self.vendor.get_bmc().get_ip()}"
-        )
+        v = self.vendor
+        bmc = self.vendor.get_bmc()
 
-        def check_monitoring(metric: Metrics):
+        def check_monitoring(source: str, metric: Metrics):
             data = self.get_metric(metric)
             if not len(data):
                 h.fatal(f"Cannot detect {str(metric)} metrics")
 
             print(
-                f"Monitoring {str(metric)} metrics:"
+                f"Monitoring/{source}: {str(metric)} metrics:"
                 + ", ".join(
                     [f"{len(data[pc])}x{pc}" for pc in data if len(data[pc]) > 0]
                 )
             )
 
         # - checking if the CPU monitoring works
         if self.hardware.cpu.get_arch() == "x86_64":
+            print("Monitoring/turbostat: initialize")
             self.turbostat = Turbostat(
                 self.hardware,
                 self.get_metric(Metrics.FREQ),
                 self.get_metric(Metrics.POWER_CONSUMPTION),
             )
-            check_monitoring(Metrics.FREQ)
+            check_monitoring("turbostat", Metrics.FREQ)
+
+        print(
+            f"Monitoring/BMC: initialize {v.name()} vendor with {bmc.get_driver_name()} driver @ {bmc.get_ip()}"
+        )
 
         # - checking if the bmc monitoring works
         # These calls will also initialize the datastructures out of the monitoring loop
         self.vendor.get_bmc().read_thermals(self.get_metric(Metrics.THERMAL))
-        check_monitoring(Metrics.THERMAL)
+        check_monitoring("BMC", Metrics.THERMAL)
 
         self.vendor.get_bmc().read_fans(self.get_metric(Metrics.FANS))
-        check_monitoring(Metrics.FANS)
+        check_monitoring("BMC", Metrics.FANS)
 
         self.vendor.get_bmc().read_power_consumption(
             self.get_metric(Metrics.POWER_CONSUMPTION)
         )
-        check_monitoring(Metrics.POWER_CONSUMPTION)
+        check_monitoring("BMC", Metrics.POWER_CONSUMPTION)
 
         self.vendor.get_bmc().read_power_supplies(
             self.get_metric(Metrics.POWER_SUPPLIES)
         )
-        check_monitoring(Metrics.POWER_SUPPLIES)
+        check_monitoring("BMC", Metrics.POWER_SUPPLIES)
 
     def __monitor_bmc(self):
         """Monitor the bmc metrics"""

diff --git a/hwbench/bench/test_benchmarks_common.py b/hwbench/bench/test_benchmarks_common.py
@@ -59,11 +59,15 @@ def get_benches(self):
 
     def parse_jobs_config(self, validate_parameters=True):
         # We need to mock turbostat when parsing config with monitoring
-        # We mock the run() command to get a constant output
-        with patch("hwbench.environment.turbostat.Turbostat.run") as ts:
-            with open("tests/parsing/turbostat/run", "r") as f:
-                ts.return_value = ast.literal_eval(f.read())
-                return self.benches.parse_jobs_config(validate_parameters)
+        with patch("hwbench.utils.helpers.is_binary_available") as iba:
+            iba.return_value = True
+            # We mock the run() and check_version() command to get a constant output
+            with patch("hwbench.environment.turbostat.Turbostat.check_version") as cv:
+                cv.return_value = True
+                with patch("hwbench.environment.turbostat.Turbostat.run") as ts:
+                    with open("tests/parsing/turbostat/run", "r") as f:
+                        ts.return_value = ast.literal_eval(f.read())
+                        return self.benches.parse_jobs_config(validate_parameters)
 
     def get_jobs_config(self) -> config.Config:
         return self.jobs_config

diff --git a/hwbench/config/test_parse.py b/hwbench/config/test_parse.py
@@ -47,7 +47,9 @@ def test_keywords(self):
                     .read_bytes()
                     .split(b":", 1)
                 )
-                self.get_jobs_config().validate_sections()
+                with patch("hwbench.utils.helpers.is_binary_available") as iba:
+                    iba.return_value = True
+                    self.get_jobs_config().validate_sections()
         except Exception as exc:
             assert False, f"'validate_sections' detected a syntax error {exc}"
 

diff --git a/hwbench/engines/test_parse.py b/hwbench/engines/test_parse.py
@@ -22,13 +22,17 @@
 def mock_engine(version: str) -> StressNG:
     # We need to patch list_module_parameters() function
     # to avoid considering the local stress-ng binary
-    with patch("hwbench.engines.stressng.EngineModuleCpu.list_module_parameters") as p:
-        p.return_value = (
-            pathlib.Path(f"./tests/parsing/stressngmethods/{version}/stdout")
-            .read_bytes()
-            .split(b":", 1)
-        )
-        return StressNG()
+    with patch("hwbench.utils.helpers.is_binary_available") as iba:
+        iba.return_value = True
+        with patch(
+            "hwbench.engines.stressng.EngineModuleCpu.list_module_parameters"
+        ) as p:
+            p.return_value = (
+                pathlib.Path(f"./tests/parsing/stressngmethods/{version}/stdout")
+                .read_bytes()
+                .split(b":", 1)
+            )
+            return StressNG()
 
 
 class TestParse(unittest.TestCase):

diff --git a/hwbench/environment/hardware.py b/hwbench/environment/hardware.py
@@ -52,6 +52,7 @@ def dump(self) -> dict[str, Optional[str | int] | dict]:
         return {
             "dmi": self.dmi.dump(),
             "cpu": self.cpu.dump(),
+            "bmc": self.vendor.get_bmc().dump(),
         }
 
     def cpu_flags(self) -> list[str]:

diff --git a/hwbench/environment/turbostat.py b/hwbench/environment/turbostat.py
@@ -1,8 +1,11 @@
 import os
+import re
 import subprocess
 from enum import Enum
+from packaging.version import Version
 from ..environment.hardware import BaseHardware
 from ..bench.monitoring_structs import MonitorMetric, CPUContext, PowerContext
+from ..utils.helpers import is_binary_available, fatal
 
 CORE = "core"
 PACKAGE = "package"
@@ -48,16 +51,49 @@ def __init__(
             CPUSTATS.CORE_WATTS,
             CPUSTATS.PACKAGE_WATTS,
         }
+        self.min_release = Version("2022.04.16")
         self.header = ""
         self.freq_metrics = freq_metrics
         self.power_metrics = power_metrics
         self.hardware = hardware
         self.process: subprocess.Popen[bytes] = None  # type: ignore[assignment]
         self.freq_metrics[str(CPUContext.CPU)] = {}  # type: ignore[no-redef]
         self.power_metrics[str(PowerContext.CPU)] = {}  # type: ignore[no-redef]
+
         # Let's make a first quick run to detect system
+        self.check_version()
         self.pre_run()
 
+    def check_version(self):
+        english_env = os.environ.copy()
+        english_env["LC_ALL"] = "C"
+
+        if not is_binary_available("turbostat"):
+            fatal("Missing turbostat binary, please install it.")
+
+        self.process = subprocess.Popen(
+            ["turbostat", "--version"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            env=english_env,
+            stdin=subprocess.DEVNULL,
+        )
+        # turbostat version 2022.04.16 - Len Brown <lenb@kernel.org>
+        match = re.search(
+            r"turbostat version (?P<version>[0-9]+\.[0-9]+\.[0-9]+).*",
+            str(self.get_process_output()),
+        )
+
+        current_version = Version(match.group("version"))
+        if not match:
+            fatal("Monitoring/turbostat: Cannot detect turbostat version")
+
+        print(f"Monitoring/turbostat: Detected release {current_version}")
+        if current_version < self.min_release:
+            fatal(
+                f"Monitoring/turbostat: minimal expected release is {self.min_release}"
+            )
+
     def reset_metrics(self, power_metrics=None):
         if power_metrics is not None:
             self.power_metrics = power_metrics
@@ -115,14 +151,14 @@ def run(self, interval: float = 1, wait=False):
             "-c",
             f"{self.hardware.get_cpu().get_logical_cores_count()-1}",
             "turbostat",
-            "-c",
+            "--cpu",
             "core",
-            "-q",
+            "--quiet",
             "--interval",
             str(interval),
-            "-n",
+            "--num_iterations",
             "1",
-            "-s",
+            "--show",
         ]
         sensors = ""
         for sensor in CPUSTATS:
@@ -189,9 +225,17 @@ def parse(self):
             items = line.split()
             core_nb = items[int(self.__get_field_position(CPUSTATS.CPU))]
             if self.has(CPUSTATS.CORE_WATTS):
-                self.power_metrics[str(PowerContext.CPU)][f"Core_{core_nb}"].add(
-                    float(items[int(self.__get_field_position(CPUSTATS.CORE_WATTS))])
-                )
+                try:
+                    self.power_metrics[str(PowerContext.CPU)][f"Core_{core_nb}"].add(
+                        float(
+                            items[int(self.__get_field_position(CPUSTATS.CORE_WATTS))]
+                        )
+                    )
+                except IndexError:
+                    # Some processors reports the corewatt in the header but not for all cores ...
+                    # So let's ignore if the metrics does not exist for this core
+                    pass
+
             self.freq_metrics[str(CPUContext.CPU)][f"Core_{core_nb}"].add(
                 float(items[int(self.__get_field_position(CPUSTATS.BUSY_MHZ))])
             )

diff --git a/hwbench/environment/vendors/vendor.py b/hwbench/environment/vendors/vendor.py
@@ -64,6 +64,14 @@ def get_ip(self) -> str:
 
         return ip
 
+    def get_driver_name(self) -> str:
+        """Return the BMC driver name"""
+        return type(self).__name__
+
+    def dump(self) -> dict[str, str]:
+        """Return the dump of the BMC"""
+        return {"driver": self.get_driver_name()}
+
     def connect_redfish(self):
         """Connect to the BMC using Redfish."""
         if not self.vendor.get_monitoring_config_filename():

diff --git a/hwbench/tuning/scheduler.py b/hwbench/tuning/scheduler.py
@@ -15,6 +15,9 @@ def run(self):
             for dirname in dirnames:
                 diskdir = pathlib.Path(rootpath) / dirname
                 file = diskdir / "queue/scheduler"
+                # Some block devices like zram do not have scheduler
+                if not os.path.isfile(file):
+                    continue
                 previous = file.read_text(encoding="utf-8").rstrip()
                 # see https://docs.kernel.org/block/switching-sched.html
                 # for deeper explanation

diff --git a/hwbench/utils/external.py b/hwbench/utils/external.py
@@ -2,6 +2,7 @@
 import pathlib
 import subprocess
 from abc import abstractmethod, ABC
+from .helpers import fatal
 
 
 class External(ABC):
@@ -39,24 +40,27 @@ def run(self):
         """Returns the output of parse_cmd (a json-able type)"""
         english_env = os.environ.copy()
         english_env["LC_ALL"] = "C"
-        if self.run_cmd_version():
-            ver = subprocess.run(
-                self.run_cmd_version(),
+        try:
+            if self.run_cmd_version():
+                ver = subprocess.run(
+                    self.run_cmd_version(),
+                    capture_output=True,
+                    cwd=self.out_dir,
+                    env=english_env,
+                    stdin=subprocess.DEVNULL,
+                )
+                self._write_output("version-stdout", ver.stdout)
+                self._write_output("version-stderr", ver.stderr)
+                self.parse_version(ver.stdout, ver.stderr)
+            out = subprocess.run(
+                self.run_cmd(),
                 capture_output=True,
                 cwd=self.out_dir,
                 env=english_env,
                 stdin=subprocess.DEVNULL,
             )
-            self._write_output("version-stdout", ver.stdout)
-            self._write_output("version-stderr", ver.stderr)
-            self.parse_version(ver.stdout, ver.stderr)
-        out = subprocess.run(
-            self.run_cmd(),
-            capture_output=True,
-            cwd=self.out_dir,
-            env=english_env,
-            stdin=subprocess.DEVNULL,
-        )
+        except FileNotFoundError as e:
+            fatal(f"Missing {e.filename} binary, please install it.")
         # save outputs
 
         self._write_output("stdout", out.stdout)

diff --git a/hwbench/utils/helpers.py b/hwbench/utils/helpers.py
@@ -2,6 +2,7 @@
 import logging
 import sys
 from datetime import timedelta
+from shutil import which
 from typing import NoReturn
 
 
@@ -24,3 +25,8 @@ def time_to_next_sync(safe_start=True):
     # Let's bump to the next minute o'clock
     next_sync += timedelta(seconds=60 - next_sync.second)
     return (next_sync - now).total_seconds(), next_sync
+
+
+def is_binary_available(binary_name: str) -> bool:
+    """A function to check if a binary is available"""
+    return which(binary_name) is not None