Skip to content

Commit

Permalink
Merge pull request #24 from criteo/bmc_metrics
Browse files Browse the repository at this point in the history
Improving Bmc metrics collection:

Fix the broken logic in the mocked BMC that generated incorrect data structures.
Simplify the logic for adding a new metric by sharing common code across the various BMC drivers.
The merge was made with two commits: the first keeps a history of what was wrong in the mocked BMC driver, and the second is the cleanup with a helper.

This merge also fixes the missing packaging python deps and adds a very simple job used during the development of this PR.
  • Loading branch information
anisse authored Jun 5, 2024
2 parents d2bc303 + 041c942 commit cb123d0
Show file tree
Hide file tree
Showing 8 changed files with 167 additions and 55 deletions.
14 changes: 14 additions & 0 deletions configs/mini.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# This configuration will :
# - load all cores with a matrixprod test for 15 sec.
[global]
runtime=15
monitor=all

[full_cpu_load]
engine=stressng
engine_module=cpu
engine_module_parameter=matrixprod
hosting_cpu_cores=all
hosting_cpu_cores_scaling=none
stressor_range=auto

47 changes: 24 additions & 23 deletions hwbench/environment/vendors/dell/dell.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import cast
from ....bench.monitoring_structs import (
MonitorMetric,
Power,
PowerCategories as PowerCat,
PowerContext,
Expand All @@ -19,11 +21,13 @@ def read_thermals(
continue
name = t["Name"].split("Temp")[0].strip()
pc = t["PhysicalContext"]
if pc not in thermals:
thermals[pc] = {}
if t["Name"] not in thermals[pc]:
thermals[pc][t["Name"]] = Temperature(name)
thermals[pc][t["Name"]].add(t["ReadingCelsius"])
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], thermals),
pc,
Temperature(name),
t["Name"],
t["ReadingCelsius"],
)
return thermals

def get_power(self):
Expand All @@ -43,28 +47,25 @@ def read_power_consumption(
# ServerPwr.1.SCViewSledPwr is computed from other metrics
# It includes the SLED power consumption + a mathematical portion of the chassis consumption
# It's computed like : ServerPwr.1.SCViewSledPwr = PowerConsumedWatts + 'SC-BMC.1.ChassisInfraPower / nb_servers'
if (
str(PowerCat.SERVERINCHASSIS)
not in power_consumption[str(PowerContext.BMC)]
):
power_consumption[str(PowerContext.BMC)][
str(PowerCat.SERVERINCHASSIS)
] = Power(str(PowerCat.SERVERINCHASSIS))
power_consumption[str(PowerContext.BMC)][str(PowerCat.SERVERINCHASSIS)].add(
oem_system["Attributes"]["ServerPwr.1.SCViewSledPwr"]
name = str(PowerCat.SERVERINCHASSIS)
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_consumption),
PowerContext.BMC,
Power(name),
name,
oem_system["Attributes"]["ServerPwr.1.SCViewSledPwr"],
)

if "SC-BMC.1.ChassisInfraPower" in oem_system["Attributes"]:
# SC-BMC.1.ChassisInfraPower reports the power consumption of the chassis infrastructure,
# not counting the SLEDs
if (
str(PowerCat.INFRASTRUCTURE)
not in power_consumption[str(PowerContext.BMC)]
):
power_consumption[str(PowerContext.BMC)][
str(PowerCat.INFRASTRUCTURE)
] = Power(str(PowerCat.INFRASTRUCTURE))
power_consumption[str(PowerContext.BMC)][str(PowerCat.INFRASTRUCTURE)].add(
oem_system["Attributes"]["SC-BMC.1.ChassisInfraPower"]
name = str(PowerCat.INFRASTRUCTURE)
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_consumption),
PowerContext.BMC,
Power(name),
name,
oem_system["Attributes"]["SC-BMC.1.ChassisInfraPower"],
)

# Let's add the sum of the power supplies to get the inlet power consumption
Expand Down
64 changes: 38 additions & 26 deletions hwbench/environment/vendors/hpe/hpe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import pathlib
import re
from typing import cast

from ....bench.monitoring_structs import (
MonitorMetric,
Power,
PowerCategories as PowerCat,
PowerContext,
Expand Down Expand Up @@ -28,8 +31,6 @@ def read_thermals(
if t["ReadingCelsius"] <= 0:
continue
pc = t["PhysicalContext"]
if pc not in thermals:
thermals[pc] = {}

# Temperature metrics are named like the following :
# 05-P1 DIMM 5-8
Expand All @@ -48,20 +49,24 @@ def read_thermals(
# 04-P1 DIMM 1-4
sd = f"{s}{d}"

def add(name):
if t["Name"] not in thermals[pc]:
thermals[pc][t["Name"]] = Temperature(name)
thermals[pc][t["Name"]].add(t["ReadingCelsius"])
def add(self, name):
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], thermals),
pc,
Temperature(name),
t["Name"],
t["ReadingCelsius"],
)

# We don't consider all sensors for now
# This could be updated depending on the needs
if s == "CPU":
add(sd)
add(self, sd)
elif s == "Inlet":
add(s)
add(self, s)
elif d == "DIMM":
# P1 DIMM 1-4
add(f"{s} {d} {de}")
add(self, f"{s} {d} {de}")
return thermals

def get_power(self):
Expand All @@ -78,10 +83,12 @@ def read_power_supplies(
# Let's update it to have a unique name
name = psu["Name"] + str(psu["Oem"]["Hpe"]["BayNumber"])
psu_name = "PS" + str(psu["Oem"]["Hpe"]["BayNumber"])
if name not in power_supplies[str(PowerContext.BMC)]:
power_supplies[str(PowerContext.BMC)][name] = Power(psu_name)
power_supplies[str(PowerContext.BMC)][name].add(
psu["Oem"]["Hpe"]["AveragePowerOutputWatts"]
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_supplies),
PowerContext.BMC,
Power(psu_name),
name,
psu["Oem"]["Hpe"]["AveragePowerOutputWatts"],
)

return power_supplies
Expand All @@ -98,26 +105,31 @@ def read_power_consumption(

# But for multi-server chassis, ...
if "HPE Apollo2000 Gen10+" in oem_chassis["Name"]:
if str(PowerContext.BMC) not in power_consumption:
power_consumption[str(PowerContext.BMC)] = {
str(PowerCat.SERVER): Power(str(PowerCat.SERVER)),
str(PowerCat.CHASSIS): Power(str(PowerCat.CHASSIS)),
str(PowerCat.SERVERINCHASSIS): Power(str(PowerCat.SERVERINCHASSIS)),
} # type: ignore[no-redef]

# On Apollo2000, the generic PowerConsumedWatts is in fact SERVERINCHASSIS
power_consumption[str(PowerContext.BMC)][str(PowerCat.SERVERINCHASSIS)].add(
self.get_power().get("PowerControl")[0]["PowerConsumedWatts"]
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_consumption),
PowerContext.BMC,
Power(str(PowerCat.SERVERINCHASSIS)),
str(PowerCat.SERVERINCHASSIS),
self.get_power().get("PowerControl")[0]["PowerConsumedWatts"],
)

# And extract SERVER from NodePowerWatts
power_consumption[str(PowerContext.BMC)][str(PowerCat.SERVER)].add(
oem_chassis["Oem"]["Hpe"]["NodePowerWatts"]
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_consumption),
PowerContext.BMC,
Power(str(PowerCat.SERVER)),
str(PowerCat.SERVER),
oem_chassis["Oem"]["Hpe"]["NodePowerWatts"],
)

# And CHASSIS from ChassisPowerWatts
power_consumption[str(PowerContext.BMC)][str(PowerCat.CHASSIS)].add(
oem_chassis["Oem"]["Hpe"]["ChassisPowerWatts"]
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_consumption),
PowerContext.BMC,
Power(str(PowerCat.CHASSIS)),
str(PowerCat.CHASSIS),
oem_chassis["Oem"]["Hpe"]["ChassisPowerWatts"],
)
return power_consumption

Expand Down
41 changes: 36 additions & 5 deletions hwbench/environment/vendors/mock.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import cast
from ...bench.monitoring_structs import (
FanContext,
MonitorMetric,
Power,
PowerCategories,
PowerContext,
Temperature,
ThermalContext,
Expand All @@ -17,29 +19,58 @@ def read_thermals(
self, thermals: dict[str, dict[str, Temperature]] = {}
) -> dict[str, dict[str, Temperature]]:
# Let's add a faked thermal metric
thermals[str(ThermalContext.CPU)] = {"CPU1": Temperature("CPU1", 40)}
name = "CPU1"

super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], thermals),
ThermalContext.CPU,
Temperature(name),
name,
40,
)
return thermals

def read_fans(
self, fans: dict[str, dict[str, MonitorMetric]] = {}
) -> dict[str, dict[str, MonitorMetric]]:
# Let's add a faked fans metric
fans[str(FanContext.FAN)] = {"Fan1": MonitorMetric("Fan1", "RPM", 40)}
name = "Fan1"
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], fans),
FanContext.FAN,
MonitorMetric(name, "RPM"),
name,
40,
)
return fans

def read_power_consumption(
self, power_consumption: dict[str, dict[str, Power]] = {}
) -> dict[str, dict[str, Power]]:
# Let's add a faked power metric
power_consumption[str(PowerContext.BMC)] = {"Chassis": Power("Chassis", 125.0)}
name = str(PowerCategories.CHASSIS)
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_consumption),
PowerContext.BMC,
Power(name),
name,
125.0,
)
return power_consumption

def read_power_supplies(
self, power_supplies: dict[str, dict[str, Power]] = {}
) -> dict[str, dict[str, Power]]:
# Let's add a faked power supplies

power_supplies[str(PowerContext.BMC)] = {"PS1 status": Power("PS1", 125.0)}
status = "PS1 status"
name = "PS1"
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_supplies),
PowerContext.BMC,
Power(name),
status,
125,
)
return power_supplies

def connect_redfish(self):
Expand Down
17 changes: 17 additions & 0 deletions hwbench/environment/vendors/vendor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pathlib
import redfish # type: ignore
from abc import ABC, abstractmethod
from typing import Any
from ...utils import helpers as h
from ...utils.external import External
from ...bench.monitoring_structs import (
Expand All @@ -31,6 +32,22 @@ def __del__(self):
if self.logged:
self.redfish_obj.logout()

def add_monitoring_value(
    self,
    monitoring_struct: dict[str, dict[str, MonitorMetric]],
    context: Any,
    metric: MonitorMetric,
    name: str,
    value: float,
) -> dict[str, dict[str, MonitorMetric]]:
    """Record <value> for the metric <name> in the monitoring data structure.

    The structure is indexed first by ``str(context)`` and then by ``name``.
    Missing levels are created on the fly: an empty dict for an unknown
    context, and ``metric`` for an unknown name. If ``name`` is already
    registered for this context, the existing entry is kept and the
    ``metric`` argument is ignored.

    Args:
        monitoring_struct: Nested mapping updated in place.
        context: Key of the first level; converted with ``str()``.
        metric: Object stored under ``name`` when no entry exists yet.
        name: Key of the second level.
        value: Sample passed to the stored metric's ``add()`` method.

    Returns:
        The same ``monitoring_struct`` object that was passed in.
    """
    # Convert the context once; it is the key for every lookup below.
    context_key = str(context)
    # setdefault creates each missing level and returns the stored entry,
    # which is the pre-existing one when the key was already present.
    metrics = monitoring_struct.setdefault(context_key, {})
    metrics.setdefault(name, metric).add(value)
    return monitoring_struct

def run_cmd(self) -> list[str]:
return ["ipmitool", "lan", "print"]

Expand Down
1 change: 1 addition & 0 deletions requirements/base.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ numpy
matplotlib
redfish
pycairo
packaging
21 changes: 20 additions & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,9 @@ numpy==1.26.4 \
packaging==24.0 \
--hash=sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5 \
--hash=sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9
# via matplotlib
# via
# -r requirements/base.in
# matplotlib
pillow==10.3.0 \
--hash=sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c \
--hash=sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2 \
Expand Down Expand Up @@ -475,6 +477,23 @@ ply==3.11 \
--hash=sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3 \
--hash=sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce
# via jsonpath-rw
pycairo==1.26.0 \
--hash=sha256:1d54e28170a5e790269d9db4c195cca5152ff018ba7e330d0ed05d86ccc2ea7d \
--hash=sha256:20a31af89d92ffd5fc60c08e65ff649f16e18621a14a40dbdb049fc74942d7a9 \
--hash=sha256:2dddd0a874fbddb21e14acd9b955881ee1dc6e63b9c549a192d613a907f9cbeb \
--hash=sha256:3e4e18ea03122e60abe3eb611e2849859cc950083ff85d8369328eadf3df63f5 \
--hash=sha256:5986b8da3e7de7ab931d7ad527938df38f75d3a3bdea2b515c786c5ca2c5093c \
--hash=sha256:675578bc6d62d15ff8669f264783efc9c8c73e3a6f564b294a70fb45a2f78667 \
--hash=sha256:696ba8024d2827e66e088a6e05a3b0aea30d289476bcb2ca47c9670d40900a50 \
--hash=sha256:8616408ae93de4824a3777ec532ea75643e4bf74e49d601062c0b1788180c962 \
--hash=sha256:9fa51168010e2dfb45499df071fca2d921893f724646f3454951000a7ad0cabb \
--hash=sha256:a611e4d82ad8470138bb46d465d47e8db826d9d80b6a520ccd83ee007f2073e4 \
--hash=sha256:a8f3b567ba2ad55624a809823ccf75aff8d768c20216cb5888365f6fc695c1d2 \
--hash=sha256:aac447b423b33b64119ecdd1ffebf9163b07f5401c5da50c707197efdd1c918a \
--hash=sha256:b6690a00fb225c19f42d76660e676aba7ae7cb18f3632cb02bce7f0d9b9c3800 \
--hash=sha256:d374d9ec6d2f791bf57105d87a9028db1ef2b687848f64a524e447033eae7229 \
--hash=sha256:d63929ab5a2f890a333f2f2f51de9f1c9fe20d1bddc982c2ca577b737448d72f
# via -r requirements/base.in
pyparsing==3.1.2 \
--hash=sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad \
--hash=sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742
Expand Down
17 changes: 17 additions & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,23 @@ ply==3.11 \
# via
# -r requirements/base.txt
# jsonpath-rw
pycairo==1.26.0 \
--hash=sha256:1d54e28170a5e790269d9db4c195cca5152ff018ba7e330d0ed05d86ccc2ea7d \
--hash=sha256:20a31af89d92ffd5fc60c08e65ff649f16e18621a14a40dbdb049fc74942d7a9 \
--hash=sha256:2dddd0a874fbddb21e14acd9b955881ee1dc6e63b9c549a192d613a907f9cbeb \
--hash=sha256:3e4e18ea03122e60abe3eb611e2849859cc950083ff85d8369328eadf3df63f5 \
--hash=sha256:5986b8da3e7de7ab931d7ad527938df38f75d3a3bdea2b515c786c5ca2c5093c \
--hash=sha256:675578bc6d62d15ff8669f264783efc9c8c73e3a6f564b294a70fb45a2f78667 \
--hash=sha256:696ba8024d2827e66e088a6e05a3b0aea30d289476bcb2ca47c9670d40900a50 \
--hash=sha256:8616408ae93de4824a3777ec532ea75643e4bf74e49d601062c0b1788180c962 \
--hash=sha256:9fa51168010e2dfb45499df071fca2d921893f724646f3454951000a7ad0cabb \
--hash=sha256:a611e4d82ad8470138bb46d465d47e8db826d9d80b6a520ccd83ee007f2073e4 \
--hash=sha256:a8f3b567ba2ad55624a809823ccf75aff8d768c20216cb5888365f6fc695c1d2 \
--hash=sha256:aac447b423b33b64119ecdd1ffebf9163b07f5401c5da50c707197efdd1c918a \
--hash=sha256:b6690a00fb225c19f42d76660e676aba7ae7cb18f3632cb02bce7f0d9b9c3800 \
--hash=sha256:d374d9ec6d2f791bf57105d87a9028db1ef2b687848f64a524e447033eae7229 \
--hash=sha256:d63929ab5a2f890a333f2f2f51de9f1c9fe20d1bddc982c2ca577b737448d72f
# via -r requirements/base.txt
pyparsing==3.1.2 \
--hash=sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad \
--hash=sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742
Expand Down

0 comments on commit cb123d0

Please sign in to comment.