Skip to content

Commit

Permalink
Add serialization error counter
Browse files Browse the repository at this point in the history
  • Loading branch information
KaspariK committed Oct 29, 2024
1 parent e8dcc8e commit b0186d1
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 11 deletions.
6 changes: 6 additions & 0 deletions tron/prom_metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from prometheus_client import Counter
from prometheus_client import Gauge


# Gauges for the resource totals allocated to Tron-launched containers.
tron_cpu_gauge = Gauge(
    name="tron_k8s_cpus",
    documentation="Total number of CPUs allocated to Tron-launched containers",
)
tron_memory_gauge = Gauge(
    name="tron_k8s_mem",
    documentation="Total amount of memory allocated to Tron-launched containers (in megabytes)",
)
tron_disk_gauge = Gauge(
    name="tron_k8s_disk",
    documentation="Total amount of disk allocated to Tron-launched containers (in megabytes)",
)

# Counter for failed attempts to serialize state_data as JSON; it is
# incremented by the DynamoDB state store's `_serialize_item` error handler.
json_serialization_errors_counter = Counter(
    name="json_serialization_errors_total",
    documentation="Total number of errors encountered while serializing state_data as JSON. These errors occur before writing to DynamoDB.",
)
24 changes: 13 additions & 11 deletions tron/serialize/runstate/dynamodb_state_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import boto3 # type: ignore

import tron.prom_metrics as prom_metrics
from tron.core.job import Job
from tron.core.jobrun import JobRun
from tron.metrics import timer
Expand Down Expand Up @@ -162,11 +163,7 @@ def save(self, key_value_pairs) -> None:
self.save_queue[key] = (val, None)
else:
state_type = self.get_type_from_key(key)
try:
json_val = self._serialize_item(state_type, val)
except Exception as e:
log.error(f"Failed to serialize JSON for key {key}: {e}")
json_val = None # Proceed without JSON if serialization fails
json_val = self._serialize_item(state_type, val)
self.save_queue[key] = (val, json_val)
break

Expand Down Expand Up @@ -205,12 +202,17 @@ def get_type_from_key(self, key: str) -> str:

# TODO: TRON-2305 - In an ideal world, we wouldn't be passing around state/state_data dicts. It would be a lot nicer to have regular objects here
def _serialize_item(self, key: Literal[runstate.JOB_STATE, runstate.JOB_RUN_STATE], state: Dict[str, Any]) -> Optional[str]: # type: ignore
# Serializes one state dict to JSON, choosing Job.to_json or JobRun.to_json
# based on the state type passed in `key`.
# NOTE(review): this is a rendered diff hunk -- the +/- markers and the
# leading indentation were stripped, so two versions of the body appear
# back to back. The split below matches the 13 additions / 11 deletions
# reported for this file, but confirm against the repository.
#
# Presumably the removed (pre-commit) body: the ValueError for an unknown
# key propagated to the caller.
if key == runstate.JOB_STATE:
return Job.to_json(state)
elif key == runstate.JOB_RUN_STATE:
return JobRun.to_json(state)
else:
raise ValueError(f"Unknown type: key {key}")
# Presumably the added (post-commit) body: the same dispatch wrapped in
# try/except, so failures no longer escape this method.
try:
if key == runstate.JOB_STATE:
return Job.to_json(state)
elif key == runstate.JOB_RUN_STATE:
return JobRun.to_json(state)
else:
raise ValueError(f"Unknown type: key {key}")
except Exception:
# Any exception raised above (including the unknown-key ValueError) is
# logged, counted in the Prometheus serialization-error counter, and
# turned into a None return value.
log.exception(f"Serialization error for key {key}")
prom_metrics.json_serialization_errors_counter.inc()
return None

def _save_loop(self):
while True:
Expand Down

0 comments on commit b0186d1

Please sign in to comment.