Skip to content

Commit

Permalink
Merge pull request #20 from criteo/monito
Browse files Browse the repository at this point in the history
Improving the monitoring portability
  • Loading branch information
ErwanAliasr1 authored May 31, 2024
2 parents 9f9333c + da3e8ad commit d2bc303
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 49 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Running the **simple.conf** job:
## Mandatory
- python >= 3.9
- [python dependencies](./requirements/base.in)
- turbostat >= 2022.07.28
- turbostat >= 2022.04.16
- numactl
- dmidecode
- util-linux >= 2.32
Expand All @@ -75,4 +75,4 @@ Running the **simple.conf** job:
## Optional
- ipmitool
- ilorest (for HPE servers)
- stress-ng >= 0.17.04
- stress-ng >= 0.17.04
7 changes: 7 additions & 0 deletions hwbench/bench/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Optional

from ..utils.external import External
from ..utils.helpers import fatal
from .parameters import BenchmarkParameters


Expand Down Expand Up @@ -47,6 +48,12 @@ def __init__(
self.engine_name = name
self.binary = binary
self.modules = modules
# FIXME: If the import is done at the file level, the mocking is lost here
# So I'm importing is_binary_available just before the call :/
from ..utils.helpers import is_binary_available

if not is_binary_available(self.binary):
fatal(f"Engine {name} requires '{binary}' binary, please install it.")

def get_binary(self) -> str:
return self.binary
Expand Down
28 changes: 14 additions & 14 deletions hwbench/bench/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,52 +57,52 @@ def __set_metric(self, metric: Metrics, value: dict[str, dict[str, MonitorMetric

def prepare(self):
"""Preparing the monitoring"""
# Let's be sure the monitoring is functional by
# - checking the BMC is actually connected to the network
if self.vendor.get_bmc().get_ip() == "0.0.0.0":
h.fatal("BMC has no IP, monitoring will not be possible")
print(
f"Starting monitoring for {self.vendor.name()} vendor with {self.vendor.get_bmc().get_ip()}"
)
v = self.vendor
bmc = self.vendor.get_bmc()

def check_monitoring(metric: Metrics):
def check_monitoring(source: str, metric: Metrics):
data = self.get_metric(metric)
if not len(data):
h.fatal(f"Cannot detect {str(metric)} metrics")

print(
f"Monitoring {str(metric)} metrics:"
f"Monitoring/{source}: {str(metric)} metrics:"
+ ", ".join(
[f"{len(data[pc])}x{pc}" for pc in data if len(data[pc]) > 0]
)
)

# - checking if the CPU monitoring works
if self.hardware.cpu.get_arch() == "x86_64":
print("Monitoring/turbostat: initialize")
self.turbostat = Turbostat(
self.hardware,
self.get_metric(Metrics.FREQ),
self.get_metric(Metrics.POWER_CONSUMPTION),
)
check_monitoring(Metrics.FREQ)
check_monitoring("turbostat", Metrics.FREQ)

print(
f"Monitoring/BMC: initialize {v.name()} vendor with {bmc.get_driver_name()} driver @ {bmc.get_ip()}"
)

# - checking if the bmc monitoring works
# These calls will also initialize the datastructures out of the monitoring loop
self.vendor.get_bmc().read_thermals(self.get_metric(Metrics.THERMAL))
check_monitoring(Metrics.THERMAL)
check_monitoring("BMC", Metrics.THERMAL)

self.vendor.get_bmc().read_fans(self.get_metric(Metrics.FANS))
check_monitoring(Metrics.FANS)
check_monitoring("BMC", Metrics.FANS)

self.vendor.get_bmc().read_power_consumption(
self.get_metric(Metrics.POWER_CONSUMPTION)
)
check_monitoring(Metrics.POWER_CONSUMPTION)
check_monitoring("BMC", Metrics.POWER_CONSUMPTION)

self.vendor.get_bmc().read_power_supplies(
self.get_metric(Metrics.POWER_SUPPLIES)
)
check_monitoring(Metrics.POWER_SUPPLIES)
check_monitoring("BMC", Metrics.POWER_SUPPLIES)

def __monitor_bmc(self):
"""Monitor the bmc metrics"""
Expand Down
14 changes: 9 additions & 5 deletions hwbench/bench/test_benchmarks_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,15 @@ def get_benches(self):

def parse_jobs_config(self, validate_parameters=True):
# We need to mock turbostat when parsing config with monitoring
# We mock the run() command to get a constant output
with patch("hwbench.environment.turbostat.Turbostat.run") as ts:
with open("tests/parsing/turbostat/run", "r") as f:
ts.return_value = ast.literal_eval(f.read())
return self.benches.parse_jobs_config(validate_parameters)
with patch("hwbench.utils.helpers.is_binary_available") as iba:
iba.return_value = True
# We mock the run() and check_version() command to get a constant output
with patch("hwbench.environment.turbostat.Turbostat.check_version") as cv:
cv.return_value = True
with patch("hwbench.environment.turbostat.Turbostat.run") as ts:
with open("tests/parsing/turbostat/run", "r") as f:
ts.return_value = ast.literal_eval(f.read())
return self.benches.parse_jobs_config(validate_parameters)

def get_jobs_config(self) -> config.Config:
return self.jobs_config
Expand Down
4 changes: 3 additions & 1 deletion hwbench/config/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ def test_keywords(self):
.read_bytes()
.split(b":", 1)
)
self.get_jobs_config().validate_sections()
with patch("hwbench.utils.helpers.is_binary_available") as iba:
iba.return_value = True
self.get_jobs_config().validate_sections()
except Exception as exc:
assert False, f"'validate_sections' detected a syntax error {exc}"

Expand Down
18 changes: 11 additions & 7 deletions hwbench/engines/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@
def mock_engine(version: str) -> StressNG:
# We need to patch list_module_parameters() function
# to avoid considering the local stress-ng binary
with patch("hwbench.engines.stressng.EngineModuleCpu.list_module_parameters") as p:
p.return_value = (
pathlib.Path(f"./tests/parsing/stressngmethods/{version}/stdout")
.read_bytes()
.split(b":", 1)
)
return StressNG()
with patch("hwbench.utils.helpers.is_binary_available") as iba:
iba.return_value = True
with patch(
"hwbench.engines.stressng.EngineModuleCpu.list_module_parameters"
) as p:
p.return_value = (
pathlib.Path(f"./tests/parsing/stressngmethods/{version}/stdout")
.read_bytes()
.split(b":", 1)
)
return StressNG()


class TestParse(unittest.TestCase):
Expand Down
1 change: 1 addition & 0 deletions hwbench/environment/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def dump(self) -> dict[str, Optional[str | int] | dict]:
return {
"dmi": self.dmi.dump(),
"cpu": self.cpu.dump(),
"bmc": self.vendor.get_bmc().dump(),
}

def cpu_flags(self) -> list[str]:
Expand Down
58 changes: 51 additions & 7 deletions hwbench/environment/turbostat.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import os
import re
import subprocess
from enum import Enum
from packaging.version import Version
from ..environment.hardware import BaseHardware
from ..bench.monitoring_structs import MonitorMetric, CPUContext, PowerContext
from ..utils.helpers import is_binary_available, fatal

CORE = "core"
PACKAGE = "package"
Expand Down Expand Up @@ -48,16 +51,49 @@ def __init__(
CPUSTATS.CORE_WATTS,
CPUSTATS.PACKAGE_WATTS,
}
self.min_release = Version("2022.04.16")
self.header = ""
self.freq_metrics = freq_metrics
self.power_metrics = power_metrics
self.hardware = hardware
self.process: subprocess.Popen[bytes] = None # type: ignore[assignment]
self.freq_metrics[str(CPUContext.CPU)] = {} # type: ignore[no-redef]
self.power_metrics[str(PowerContext.CPU)] = {} # type: ignore[no-redef]

# Let's make a first quick run to detect system
self.check_version()
self.pre_run()

def check_version(self):
english_env = os.environ.copy()
english_env["LC_ALL"] = "C"

if not is_binary_available("turbostat"):
fatal("Missing turbostat binary, please install it.")

self.process = subprocess.Popen(
["turbostat", "--version"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
env=english_env,
stdin=subprocess.DEVNULL,
)
# turbostat version 2022.04.16 - Len Brown <lenb@kernel.org>
match = re.search(
r"turbostat version (?P<version>[0-9]+\.[0-9]+\.[0-9]+).*",
str(self.get_process_output()),
)

current_version = Version(match.group("version"))
if not match:
fatal("Monitoring/turbostat: Cannot detect turbostat version")

print(f"Monitoring/turbostat: Detected release {current_version}")
if current_version < self.min_release:
fatal(
f"Monitoring/turbostat: minimal expected release is {self.min_release}"
)

def reset_metrics(self, power_metrics=None):
if power_metrics is not None:
self.power_metrics = power_metrics
Expand Down Expand Up @@ -115,14 +151,14 @@ def run(self, interval: float = 1, wait=False):
"-c",
f"{self.hardware.get_cpu().get_logical_cores_count()-1}",
"turbostat",
"-c",
"--cpu",
"core",
"-q",
"--quiet",
"--interval",
str(interval),
"-n",
"--num_iterations",
"1",
"-s",
"--show",
]
sensors = ""
for sensor in CPUSTATS:
Expand Down Expand Up @@ -189,9 +225,17 @@ def parse(self):
items = line.split()
core_nb = items[int(self.__get_field_position(CPUSTATS.CPU))]
if self.has(CPUSTATS.CORE_WATTS):
self.power_metrics[str(PowerContext.CPU)][f"Core_{core_nb}"].add(
float(items[int(self.__get_field_position(CPUSTATS.CORE_WATTS))])
)
try:
self.power_metrics[str(PowerContext.CPU)][f"Core_{core_nb}"].add(
float(
items[int(self.__get_field_position(CPUSTATS.CORE_WATTS))]
)
)
except IndexError:
# Some processors reports the corewatt in the header but not for all cores ...
# So let's ignore if the metrics does not exist for this core
pass

self.freq_metrics[str(CPUContext.CPU)][f"Core_{core_nb}"].add(
float(items[int(self.__get_field_position(CPUSTATS.BUSY_MHZ))])
)
Expand Down
8 changes: 8 additions & 0 deletions hwbench/environment/vendors/vendor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ def get_ip(self) -> str:

return ip

def get_driver_name(self) -> str:
"""Return the BMC driver name"""
return type(self).__name__

def dump(self) -> dict[str, str]:
"""Return the dump of the BMC"""
return {"driver": self.get_driver_name()}

def connect_redfish(self):
"""Connect to the BMC using Redfish."""
if not self.vendor.get_monitoring_config_filename():
Expand Down
3 changes: 3 additions & 0 deletions hwbench/tuning/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ def run(self):
for dirname in dirnames:
diskdir = pathlib.Path(rootpath) / dirname
file = diskdir / "queue/scheduler"
# Some block devices like zram do not have scheduler
if not os.path.isfile(file):
continue
previous = file.read_text(encoding="utf-8").rstrip()
# see https://docs.kernel.org/block/switching-sched.html
# for deeper explanation
Expand Down
30 changes: 17 additions & 13 deletions hwbench/utils/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pathlib
import subprocess
from abc import abstractmethod, ABC
from .helpers import fatal


class External(ABC):
Expand Down Expand Up @@ -39,24 +40,27 @@ def run(self):
"""Returns the output of parse_cmd (a json-able type)"""
english_env = os.environ.copy()
english_env["LC_ALL"] = "C"
if self.run_cmd_version():
ver = subprocess.run(
self.run_cmd_version(),
try:
if self.run_cmd_version():
ver = subprocess.run(
self.run_cmd_version(),
capture_output=True,
cwd=self.out_dir,
env=english_env,
stdin=subprocess.DEVNULL,
)
self._write_output("version-stdout", ver.stdout)
self._write_output("version-stderr", ver.stderr)
self.parse_version(ver.stdout, ver.stderr)
out = subprocess.run(
self.run_cmd(),
capture_output=True,
cwd=self.out_dir,
env=english_env,
stdin=subprocess.DEVNULL,
)
self._write_output("version-stdout", ver.stdout)
self._write_output("version-stderr", ver.stderr)
self.parse_version(ver.stdout, ver.stderr)
out = subprocess.run(
self.run_cmd(),
capture_output=True,
cwd=self.out_dir,
env=english_env,
stdin=subprocess.DEVNULL,
)
except FileNotFoundError as e:
fatal(f"Missing {e.filename} binary, please install it.")
# save outputs

self._write_output("stdout", out.stdout)
Expand Down
6 changes: 6 additions & 0 deletions hwbench/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import sys
from datetime import timedelta
from shutil import which
from typing import NoReturn


Expand All @@ -24,3 +25,8 @@ def time_to_next_sync(safe_start=True):
# Let's bump to the next minute o'clock
next_sync += timedelta(seconds=60 - next_sync.second)
return (next_sync - now).total_seconds(), next_sync


def is_binary_available(binary_name: str) -> bool:
"""A function to check if a binary is available"""
return which(binary_name) is not None

0 comments on commit d2bc303

Please sign in to comment.