Skip to content

Commit

Permalink
Merge pull request #34 from criteo/various
Browse files Browse the repository at this point in the history
A collection of fixes to improve performance and stability
  • Loading branch information
ErwanAliasr1 authored Oct 28, 2024
2 parents f2fa9c2 + eef217a commit a930da3
Show file tree
Hide file tree
Showing 10 changed files with 193 additions and 36 deletions.
35 changes: 29 additions & 6 deletions graph/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def init_matplotlib(args):
fatal(f"Cannot load matplotlib backend engine {args.engine}")


GRAPH_TYPES = ["perf", "perf_watt", "watts"]
GRAPH_TYPES = ["perf", "perf_watt", "watts", "cpu_clock"]


class Graph:
Expand Down Expand Up @@ -260,9 +260,32 @@ def generic_graph(
time_serie.append(time)
# Collect all components mean value
for component in components:
if component.get_name() not in data_serie:
data_serie[component.get_name()] = []
data_serie[component.get_name()].append(component.get_mean()[sample])
if component.get_full_name() not in data_serie:
data_serie[component.get_full_name()] = []
# If we are missing some datapoints ....
if len(component.get_mean()) <= sample:
# If the user didn't explicitly agree to replace it with 0, let's be fatal
if not args.ignore_missing_datapoint:
fatal(
f"{trace.get_name()}/{bench.get_bench_name()}: {component.get_full_name()} is missing the {sample+1}th data point.\
Use --ignore-missing-datapoint to ignore this case. Generated graphs will be partially incorrect."
)
else:
# User is fine with a missing data to be replaced.
# Let's do that so we can render things properly.

# Let's pick the last known value
if args.ignore_missing_datapoint == "last":
data_serie[component.get_full_name()].append(
component.get_mean()[-1]
)
else:
# Replace it by a zero
data_serie[component.get_full_name()].append(0)
else:
data_serie[component.get_full_name()].append(
component.get_mean()[sample]
)

if second_axis:
for _, entry in bench.get_monitoring_metric(second_axis).items():
Expand Down Expand Up @@ -291,8 +314,8 @@ def generic_graph(
graph.get_ax2().plot(x_serie, y2_serie, "", label=data2_item, marker="o")

for component in components:
y_serie = np.array(data_serie[component.get_name()])[order]
graph.get_ax().plot(x_serie, y_serie, "", label=component.get_name())
y_serie = np.array(data_serie[component.get_full_name()])[order]
graph.get_ax().plot(x_serie, y_serie, "", label=component.get_full_name())

graph.prepare_axes(
30,
Expand Down
29 changes: 21 additions & 8 deletions graph/hwgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,20 @@ def graph_monitoring_metrics(args, trace: Trace, bench_name: str, output_dir) ->
metrics = bench.get_component(Metrics.MONITOR, metric_name)
if metrics:
for metric in metrics:
rendered_graphs += yerr_graph(
args,
output_dir,
bench,
Metrics.MONITOR,
metrics[metric],
prefix=f"{metric_name}.",
)
# If a metric has no measure, let's ignore it
if len(metrics[metric].get_samples()) == 0:
print(
f"{bench_name}: No samples found in {metric_name}.{metric}, ignoring metric."
)
else:
rendered_graphs += yerr_graph(
args,
output_dir,
bench,
Metrics.MONITOR,
metrics[metric],
prefix=f"{metric_name}.",
)

return rendered_graphs

Expand Down Expand Up @@ -344,6 +350,13 @@ def main():
action="store_true",
help="Enable verbose mode",
)

parser_graph.add_argument(
"--ignore-missing-datapoint",
choices=["zero", "last"],
default="",
help="Replace a missing datapoint instead of stopping the rendering. Could be by a zero or the last known value.",
)
parser_graph.set_defaults(func=render_traces)

parser_list = subparsers.add_parser(
Expand Down
25 changes: 25 additions & 0 deletions graph/scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt = {} # type: dict[str, dict[str, Any]]
aggregated_watt = {} # type: dict[str, dict[str, Any]]
aggregated_watt_err = {} # type: dict[str, dict[str, Any]]
aggregated_cpu_clock = {} # type: dict[str, dict[str, Any]]
aggregated_cpu_clock_err = {} # type: dict[str, dict[str, Any]]
workers = {} # type: dict[str, list]
logical_core_per_worker = []
perf_list, unit = benches[emp]["metrics"]
Expand All @@ -41,6 +43,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt[perf] = {}
aggregated_watt[perf] = {}
aggregated_watt_err[perf] = {}
aggregated_cpu_clock[perf] = {}
aggregated_cpu_clock_err[perf] = {}
# For every trace file given at the command line
for trace in args.traces:
workers[trace.get_name()] = []
Expand All @@ -63,13 +67,17 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt[perf][trace.get_name()] = []
aggregated_watt[perf][trace.get_name()] = []
aggregated_watt_err[perf][trace.get_name()] = []
aggregated_cpu_clock[perf][trace.get_name()] = []
aggregated_cpu_clock_err[perf][trace.get_name()] = []

bench.add_perf(
perf,
traces_perf=aggregated_perfs[perf][trace.get_name()],
perf_watt=aggregated_perfs_watt[perf][trace.get_name()],
watt=aggregated_watt[perf][trace.get_name()],
watt_err=aggregated_watt_err[perf][trace.get_name()],
cpu_clock=aggregated_cpu_clock[perf][trace.get_name()],
cpu_clock_err=aggregated_cpu_clock_err[perf][trace.get_name()],
)

# Let's render all graphs types
Expand All @@ -94,6 +102,13 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
outfile = f"scaling_watt_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
y_label = "Watts"
y_source = aggregated_watt
elif "cpu_clock" in graph_type:
graph_type_title = (
f"Scaling {graph_type}: {args.traces[0].get_metric_name()}"
)
outfile = f"scaling_cpu_clock_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
y_label = "Mhz"
y_source = aggregated_cpu_clock
else:
graph_type_title = (
f"Scaling {graph_type}: {bench.get_title_engine_name()}"
Expand Down Expand Up @@ -164,6 +179,16 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
capsize=4,
label=trace_name,
)
elif y_source == aggregated_cpu_clock:
graph.get_ax().errorbar(
x_serie,
y_serie,
yerr=np.array(aggregated_cpu_clock_err[perf][trace_name]).T,
ecolor=e_color,
color=color_name,
capsize=4,
label=trace_name,
)
else:
graph.get_ax().plot(
x_serie,
Expand Down
39 changes: 38 additions & 1 deletion graph/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def load_monitoring(self):
mm = Temperature(measure, original_measure["unit"])
else:
mm = MonitorMetric(measure, original_measure["unit"])
mm.load_from_dict(original_measure)
mm.load_from_dict(original_measure, measure)
self.metrics[metric][component_family][measure] = mm
else:
fatal(f"Unexpected {metric} in monitoring")
Expand Down Expand Up @@ -232,6 +232,8 @@ def add_perf(
perf_watt=None,
watt=None,
watt_err=None,
cpu_clock=None,
cpu_clock_err=None,
index=None,
) -> None:
"""Extract performance and power efficiency"""
Expand Down Expand Up @@ -316,6 +318,41 @@ def add_perf(
watt_err.append(metric)
else:
watt_err[index] = metric

if cpu_clock is not None:
mm = self.get_monitoring_metric(Metrics.FREQ)
mean_values = []
min_values = []
max_values = []

for freq_metric in mm:
if freq_metric != "CPU":
continue
# We have to compute metrics of all systems cores
for core in mm[freq_metric]:
# MIN of min ?
# Mean of mean ?
# Max of max ?
min_values.append(min(mm[freq_metric][core].get_min()))
mean_values.append(mean(mm[freq_metric][core].get_mean()))
max_values.append(max(mm[freq_metric][core].get_max()))
min_value = min(min_values)
mean_value = mean(mean_values)
max_value = max(max_values)

if index is None:
cpu_clock.append(mean_value)
else:
cpu_clock[index] = mean_value

# If we want to keep the error distribution to plot error bars
if cpu_clock_err is not None:
metric = (mean_value - min_value, max_value - mean_value)
if index is None:
cpu_clock_err.append(metric)
else:
cpu_clock_err[index] = metric

except ValueError:
fatal(f"No {perf} found in {self.get_bench_name()}")

Expand Down
4 changes: 2 additions & 2 deletions hwbench/bench/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ def next_iter():
start_time = self.get_monotonic_clock()
if self.turbostat:
# Turbostat will run for the whole duration of this loop
# We just retract a 2/10th of second to ensure it will not overdue
self.turbostat.run(interval=(precision - 0.2))
# We just retract a 5/10th of second to ensure it will not overdue
self.turbostat.run(interval=(precision - 0.5))
# Let's monitor the time spent at monitoring the CPU
self.get_metric(Metrics.MONITOR)["CPU"]["Polling"].add(
(self.get_monotonic_clock() - start_time) * 1e-6
Expand Down
9 changes: 8 additions & 1 deletion hwbench/bench/monitoring_structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def __post_init__(self):
# If a value is given, let's add it
self.add(self.value)

def load_from_dict(self, input: dict):
def load_from_dict(self, input: dict, full_name: str):
self.full_name = full_name
self.name = str(input.get("name"))
self.unit = str(input.get("unit"))
self.mean = input.get("mean") # type: ignore[assignment]
Expand All @@ -35,6 +36,12 @@ def load_from_dict(self, input: dict):
self.stdev = input.get("stdev") # type: ignore[assignment]
self.samples = input.get("samples") # type: ignore[assignment]

def get_full_name(self):
    """Return the metric full name, falling back to the short name when unset."""
    # `or` selects full_name whenever it is truthy, mirroring the if/else form.
    return self.full_name or self.name

def get_name(self):
    """Return the metric name"""
    metric_name = self.name
    return metric_name
Expand Down
20 changes: 16 additions & 4 deletions hwbench/environment/vendors/dell/dell.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@


class IDRAC(BMC):
oem_endpoint = ""

def get_thermal(self):
    """Fetch the chassis Thermal resource from the iDRAC Redfish API."""
    thermal_endpoint = "/redfish/v1/Chassis/System.Embedded.1/Thermal"
    return self.get_redfish_url(thermal_endpoint)

Expand Down Expand Up @@ -42,17 +44,27 @@ def get_power(self):
return self.get_redfish_url("/redfish/v1/Chassis/System.Embedded.1/Power")

def get_oem_system(self):
    """Return the Dell OEM system attributes from the iDRAC Redfish API.

    Tries the DellAttributes endpoint first, then falls back to the
    generic Attributes endpoint. The working endpoint is cached in
    self.oem_endpoint so subsequent calls probe only once.
    Fatal if neither endpoint exposes an "Attributes" payload.
    """
    # If we already found the proper endpoint, let's reuse it.
    if self.oem_endpoint:
        return self.get_redfish_url(
            self.oem_endpoint,
            log_failure=False,
        )

    new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1"
    oem = self.get_redfish_url(
        new_oem_endpoint,
        log_failure=False,
    )
    # If not System.Embedded, let's use the default attributes
    if "Attributes" not in oem:
        new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes"
        oem = self.get_redfish_url(new_oem_endpoint)
        if "Attributes" not in oem:
            h.fatal("Cannot find Dell OEM metrics, please fill a bug.")

    # Let's save the endpoint to avoid trying all of them at every run
    self.oem_endpoint = new_oem_endpoint
    return oem

def read_power_consumption(
Expand Down
61 changes: 48 additions & 13 deletions hwbench/environment/vendors/hpe/hpe.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging
import pathlib
import re
from functools import cache
from typing import cast

from ....bench.monitoring_structs import (
Expand Down Expand Up @@ -84,24 +86,45 @@ def add(self, name):
def get_power(self):
    """Fetch the chassis Power resource from the iLO Redfish API."""
    power_endpoint = "/redfish/v1/Chassis/1/Power/"
    return self.get_redfish_url(power_endpoint)

@cache
def __warn_psu(self, psu_number, message):
    # Log a PSU health warning. @cache deduplicates: each distinct
    # (self, psu_number, message) triple is logged only once, so repeated
    # polling loops don't flood the log with the same warning.
    # NOTE(review): caching a bound method keeps `self` alive for the
    # process lifetime (ruff B019) — acceptable if the BMC object is
    # long-lived; confirm.
    logging.error(f"PSU {psu_number}: {message}")

def read_power_supplies(
    self, power_supplies: dict[str, dict[str, Power]] = {}
) -> dict[str, dict[str, Power]]:
    """Return power supplies power from server.

    Only PSUs reported as healthy (State == "enabled") are recorded;
    unhealthy or status-less PSUs are logged via __warn_psu and skipped.
    NOTE(review): the mutable default argument is kept as-is — callers
    appear to pass/accumulate an explicit dict; confirm before changing.
    """
    if str(PowerContext.BMC) not in power_supplies:
        power_supplies[str(PowerContext.BMC)] = {}  # type: ignore[no-redef]
    for psu in self.get_power().get("PowerSupplies"):
        psu_position = str(psu["Oem"]["Hpe"]["BayNumber"])
        psu_status = psu.get("Status")
        if psu_status:
            psu_state = psu_status.get("State")
            if psu_state:
                # We only consider healthy PSU
                if str(psu_state).lower() == "enabled":
                    # All PSUs are named the same (HpeServerPowerSupply)
                    # Let's update it to have a unique name
                    name = psu["Name"] + psu_position
                    psu_name = "PS" + psu_position
                    super().add_monitoring_value(
                        cast(dict[str, dict[str, MonitorMetric]], power_supplies),
                        PowerContext.BMC,
                        Power(psu_name),
                        name,
                        psu["Oem"]["Hpe"]["AveragePowerOutputWatts"],
                    )
                else:
                    # Let's inform the user the PSU is reported as non healthy
                    self.__warn_psu(
                        psu_position,
                        f'marked as {psu_state} in {psu_status.get("Health")} state',
                    )
                continue

        # Let's inform the user that no status was found, maybe a parsing or fw issue ?
        self.__warn_psu(psu_position, "no status or state found !")

    return power_supplies

Expand Down Expand Up @@ -145,11 +168,23 @@ def read_power_consumption(
)
return power_consumption

def get_oem_chassis(self):
return self.get_redfish_url(
"/redfish/v1/Chassis/enclosurechassis/", log_failure=False
@cache
def is_multinode_chassis(self) -> bool:
    """Whether this server is part of a multi-node enclosure chassis.

    Probes the Redfish enclosure endpoint; a non-empty answer means the
    enclosure exists. The result is cached so the endpoint is queried at
    most once per instance.
    NOTE(review): @cache on a method keeps `self` alive for the process
    lifetime (ruff B019) — acceptable if the BMC object is long-lived.
    """
    # bool() replaces the redundant `True if x else False` form.
    return bool(
        self.get_redfish_url(
            "/redfish/v1/Chassis/enclosurechassis/", log_failure=False
        )
    )

def get_oem_chassis(self):
    """Return the enclosure chassis resource, or {} on single-node servers."""
    # Guard clause: nothing to fetch unless we sit in a multi-node enclosure.
    if not self.is_multinode_chassis():
        return {}
    return self.get_redfish_url(
        "/redfish/v1/Chassis/enclosurechassis/", log_failure=False
    )


class Hpe(Vendor):
def __init__(self, out_dir, dmi, monitoring_config_filename):
Expand Down
Loading

0 comments on commit a930da3

Please sign in to comment.