From 2ce62adae913e0264ce5a87f64a9e014c0c65112 Mon Sep 17 00:00:00 2001 From: Kevin Kaspari Date: Mon, 28 Oct 2024 13:54:34 -0700 Subject: [PATCH] Add serialization error counter --- tron/prom_metrics.py | 6 +++++ .../runstate/dynamodb_state_store.py | 24 ++++++++++--------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/tron/prom_metrics.py b/tron/prom_metrics.py index 4a11bb8fd..d6eef803c 100644 --- a/tron/prom_metrics.py +++ b/tron/prom_metrics.py @@ -1,6 +1,12 @@ +from prometheus_client import Counter from prometheus_client import Gauge tron_cpu_gauge = Gauge("tron_k8s_cpus", "Total number of CPUs allocated to Tron-launched containers") tron_memory_gauge = Gauge("tron_k8s_mem", "Total amount of memory allocated to Tron-launched containers (in megabytes)") tron_disk_gauge = Gauge("tron_k8s_disk", "Total amount of disk allocated to Tron-launched containers (in megabytes)") + +json_serialization_errors_counter = Counter( + "json_serialization_errors_total", + "Total number of errors encountered while serializing state_data as JSON. These errors occur before writing to DynamoDB.", +) diff --git a/tron/serialize/runstate/dynamodb_state_store.py b/tron/serialize/runstate/dynamodb_state_store.py index 06b5bbabd..34b3a6976 100644 --- a/tron/serialize/runstate/dynamodb_state_store.py +++ b/tron/serialize/runstate/dynamodb_state_store.py @@ -20,6 +20,7 @@ import boto3 # type: ignore +import tron.prom_metrics as prom_metrics from tron.core.job import Job from tron.core.jobrun import JobRun from tron.metrics import timer @@ -162,11 +163,7 @@ def save(self, key_value_pairs) -> None: self.save_queue[key] = (val, None) else: state_type = self.get_type_from_key(key) - try: - json_val = self._serialize_item(state_type, val) - except Exception as e: - log.error(f"Failed to serialize JSON for key {key}: {e}") - json_val = None # Proceed without JSON if serialization fails + json_val = self._serialize_item(state_type, val) self.save_queue[key] = (val, json_val) break @@ -205,12 +202,17 @@ def get_type_from_key(self, key: str) -> str: # TODO: TRON-2305 - In an ideal world, we wouldn't be passing around state/state_data dicts. It would be a lot nicer to have regular objects here def _serialize_item(self, key: Literal[runstate.JOB_STATE, runstate.JOB_RUN_STATE], state: Dict[str, Any]) -> Optional[str]: # type: ignore - if key == runstate.JOB_STATE: - return Job.to_json(state) - elif key == runstate.JOB_RUN_STATE: - return JobRun.to_json(state) - else: - raise ValueError(f"Unknown type: key {key}") + try: + if key == runstate.JOB_STATE: + return Job.to_json(state) + elif key == runstate.JOB_RUN_STATE: + return JobRun.to_json(state) + else: + raise ValueError(f"Unknown type: key {key}") + except Exception: + log.exception(f"Serialization error for key {key}") + prom_metrics.json_serialization_errors_counter.inc() + return None def _save_loop(self): while True: