Skip to content

Commit

Permalink
Merge branch 'ecmwf:develop' into pr/stretched-grid-config
Browse files Browse the repository at this point in the history
  • Loading branch information
havardhhaugen authored Nov 11, 2024
2 parents 6e61e8d + 58dad76 commit 6038b1f
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ Keep it human-readable, your future self will thank you!
- Updated configuration examples in documentation and corrected links - [#46](https://github.com/ecmwf/anemoi-training/pull/46)
- Remove credential prompt from mlflow login, replace with seed refresh token via web - [#78](https://github.com/ecmwf/anemoi-training/pull/78)
- Update CODEOWNERS
- Change how mlflow measures CPU Memory usage - [#94](https://github.com/ecmwf/anemoi-training/pull/94)

## [0.1.0 - Anemoi training - First release](https://github.com/ecmwf/anemoi-training/releases/tag/0.1.0) - 2024-08-16

Expand Down
51 changes: 50 additions & 1 deletion src/anemoi/training/diagnostics/mlflow/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,10 +433,59 @@ def experiment(self) -> MLFlowLogger.experiment:
def log_system_metrics(self) -> None:
"""Log system metrics (CPU, GPU, etc)."""
import mlflow
import psutil
from mlflow.system_metrics.metrics.base_metrics_monitor import BaseMetricsMonitor
from mlflow.system_metrics.metrics.disk_monitor import DiskMonitor
from mlflow.system_metrics.metrics.gpu_monitor import GPUMonitor
from mlflow.system_metrics.metrics.network_monitor import NetworkMonitor
from mlflow.system_metrics.system_metrics_monitor import SystemMetricsMonitor

class CustomCPUMonitor(BaseMetricsMonitor):
    """Monitor CPU utilization and system memory usage.

    Stands in for MLflow's default CPU monitor: used memory is computed as
    ``total - available`` instead of ``psutil.virtual_memory().used``, and the
    total system memory is reported as an additional metric.
    """

    def collect_metrics(self) -> None:
        """Take one sample of CPU and memory stats and append it to ``self._metrics``."""
        # Get CPU metrics.
        self._metrics["cpu_utilization_percentage"].append(psutil.cpu_percent())

        system_memory = psutil.virtual_memory()
        # Change the formula for measuring CPU memory usage.
        # By default MLflow uses psutil.virtual_memory().used.
        # Tests have shown that "used" underreports memory usage by as much as a factor of 2;
        # "used" also misses increased memory usage from using a higher prefetch factor.
        used_bytes = system_memory.total - system_memory.available
        self._metrics["system_memory_usage_megabytes"].append(used_bytes / 1e6)
        self._metrics["system_memory_usage_percentage"].append(system_memory.percent)

        # QOL: report the total system memory in raw numbers
        self._metrics["system_memory_total_megabytes"].append(system_memory.total / 1e6)

    def aggregate_metrics(self) -> dict[str, float]:
        """Return the mean of the collected samples per metric, rounded to one decimal place."""
        return {k: round(sum(v) / len(v), 1) for k, v in self._metrics.items()}

class CustomSystemMetricsMonitor(SystemMetricsMonitor):
    """System metrics monitor that collects CPU/memory stats with ``CustomCPUMonitor``."""

    def __init__(self, run_id: str, resume_logging: bool = False) -> None:
        """Initialise the parent monitor, then install the custom set of monitors.

        Parameters
        ----------
        run_id : str
            Run identifier, forwarded to ``SystemMetricsMonitor``.
        resume_logging : bool, optional
            Forwarded unchanged to ``SystemMetricsMonitor``, by default False.
        """
        super().__init__(run_id, resume_logging=resume_logging)

        # Replace the CPUMonitor with custom implementation
        self.monitors = [CustomCPUMonitor(), DiskMonitor(), NetworkMonitor()]

        # GPU metrics are optional: any failure to build the monitor is logged
        # and skipped so that the remaining system metrics are still collected.
        try:
            gpu_monitor = GPUMonitor()
        except ImportError:
            # Adjacent literals instead of a backslash continuation, so the
            # source indentation never ends up inside the logged message.
            LOGGER.warning(
                "`pynvml` is not installed, to log GPU metrics please run `pip install pynvml` "
                "to install it",
            )
        except Exception as err:  # noqa: BLE001
            # NOTE(review): GPUMonitor is expected to raise here when NVML cannot
            # be initialised (e.g. no usable GPU on the node) - confirm against
            # the pinned mlflow version.
            LOGGER.warning(
                "Skip logging GPU metrics because creating `GPUMonitor` failed with error: %s",
                err,
            )
        else:
            self.monitors.append(gpu_monitor)

mlflow.enable_system_metrics_logging()
system_monitor = SystemMetricsMonitor(
system_monitor = CustomSystemMetricsMonitor(
self.run_id,
resume_logging=self.run_id is not None,
)
Expand Down

0 comments on commit 6038b1f

Please sign in to comment.