graph: Adding CPU clock scaling graph

As per issue #39, it would be very useful to have a new type of graph that represents how the CPU clock performed during a benchmark. This new graph gives a quick overview of how the CPU frequency behaves during a scaling benchmark. The rendering is not as precise as the environment graphs, but it gives a brief overview with the following trade-offs for the y-error bars: - the min of the y-error bar is the min of the min values; - the mean of the y-error bar is the mean of the mean values; - the max of the y-error bar is the max of the max values. Signed-off-by: Erwan Velu <e.velu@criteo.com>
criteo · Oct 28, 2024 · eef217a · eef217a
1 parent f8c8e15
commit eef217a
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 1 deletion.
diff --git a/graph/graph.py b/graph/graph.py
@@ -17,7 +17,7 @@ def init_matplotlib(args):
         fatal(f"Cannot load matplotlib backend engine {args.engine}")
 
 
-GRAPH_TYPES = ["perf", "perf_watt", "watts"]
+GRAPH_TYPES = ["perf", "perf_watt", "watts", "cpu_clock"]
 
 
 class Graph:

diff --git a/graph/scaling.py b/graph/scaling.py
@@ -25,6 +25,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
         aggregated_perfs_watt = {}  # type: dict[str, dict[str, Any]]
         aggregated_watt = {}  # type: dict[str, dict[str, Any]]
         aggregated_watt_err = {}  # type: dict[str, dict[str, Any]]
+        aggregated_cpu_clock = {}  # type: dict[str, dict[str, Any]]
+        aggregated_cpu_clock_err = {}  # type: dict[str, dict[str, Any]]
         workers = {}  # type: dict[str, list]
         logical_core_per_worker = []
         perf_list, unit = benches[emp]["metrics"]
@@ -41,6 +43,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
                 aggregated_perfs_watt[perf] = {}
                 aggregated_watt[perf] = {}
                 aggregated_watt_err[perf] = {}
+                aggregated_cpu_clock[perf] = {}
+                aggregated_cpu_clock_err[perf] = {}
             # For every trace file given at the command line
             for trace in args.traces:
                 workers[trace.get_name()] = []
@@ -63,13 +67,17 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
                         aggregated_perfs_watt[perf][trace.get_name()] = []
                         aggregated_watt[perf][trace.get_name()] = []
                         aggregated_watt_err[perf][trace.get_name()] = []
+                        aggregated_cpu_clock[perf][trace.get_name()] = []
+                        aggregated_cpu_clock_err[perf][trace.get_name()] = []
 
                     bench.add_perf(
                         perf,
                         traces_perf=aggregated_perfs[perf][trace.get_name()],
                         perf_watt=aggregated_perfs_watt[perf][trace.get_name()],
                         watt=aggregated_watt[perf][trace.get_name()],
                         watt_err=aggregated_watt_err[perf][trace.get_name()],
+                        cpu_clock=aggregated_cpu_clock[perf][trace.get_name()],
+                        cpu_clock_err=aggregated_cpu_clock_err[perf][trace.get_name()],
                     )
 
         # Let's render all graphs types
@@ -94,6 +102,13 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
                     outfile = f"scaling_watt_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
                     y_label = "Watts"
                     y_source = aggregated_watt
+                elif "cpu_clock" in graph_type:
+                    graph_type_title = (
+                        f"Scaling {graph_type}: {args.traces[0].get_metric_name()}"
+                    )
+                    outfile = f"scaling_cpu_clock_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
+                    y_label = "Mhz"
+                    y_source = aggregated_cpu_clock
                 else:
                     graph_type_title = (
                         f"Scaling {graph_type}: {bench.get_title_engine_name()}"
@@ -164,6 +179,16 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
                             capsize=4,
                             label=trace_name,
                         )
+                    elif y_source == aggregated_cpu_clock:
+                        graph.get_ax().errorbar(
+                            x_serie,
+                            y_serie,
+                            yerr=np.array(aggregated_cpu_clock_err[perf][trace_name]).T,
+                            ecolor=e_color,
+                            color=color_name,
+                            capsize=4,
+                            label=trace_name,
+                        )
                     else:
                         graph.get_ax().plot(
                             x_serie,

diff --git a/graph/trace.py b/graph/trace.py
@@ -232,6 +232,8 @@ def add_perf(
         perf_watt=None,
         watt=None,
         watt_err=None,
+        cpu_clock=None,
+        cpu_clock_err=None,
         index=None,
     ) -> None:
         """Extract performance and power efficiency"""
@@ -316,6 +318,41 @@ def add_perf(
                         watt_err.append(metric)
                     else:
                         watt_err[index] = metric
+
+            if cpu_clock is not None:
+                mm = self.get_monitoring_metric(Metrics.FREQ)
+                mean_values = []
+                min_values = []
+                max_values = []
+
+                for freq_metric in mm:
+                    if freq_metric != "CPU":
+                        continue
+                    # We have to compute metrics of all systems cores
+                    for core in mm[freq_metric]:
+                        # MIN of min ?
+                        # Mean of mean ?
+                        # Max of max ?
+                        min_values.append(min(mm[freq_metric][core].get_min()))
+                        mean_values.append(mean(mm[freq_metric][core].get_mean()))
+                        max_values.append(max(mm[freq_metric][core].get_max()))
+                    min_value = min(min_values)
+                    mean_value = mean(mean_values)
+                    max_value = max(max_values)
+
+                if index is None:
+                    cpu_clock.append(mean_value)
+                else:
+                    cpu_clock[index] = mean_value
+
+                # If we want to keep the error distribution to plot error bars
+                if cpu_clock_err is not None:
+                    metric = (mean_value - min_value, max_value - mean_value)
+                    if index is None:
+                        cpu_clock_err.append(metric)
+                    else:
+                        cpu_clock_err[index] = metric
+
         except ValueError:
             fatal(f"No {perf} found in {self.get_bench_name()}")