From 2cff00f6d2be338da9ff660093aa68c0cd2f51b3 Mon Sep 17 00:00:00 2001 From: Luis Perez Date: Thu, 27 Jun 2024 07:36:43 -0700 Subject: [PATCH] Allow task_processing-launched pods to handle their own cleanup We have some use cases where we'd like a set of batches to run some cleanup code upon receiving a SIGTERM, so we'll need to remove our force-kill code and instead fall back to the default k8s behavior of giving Pods 30s after sending a SIGTERM before sending a SIGKILL to force-terminate :) I will also follow this up with a Tron documentation change that documents this behavior. --- task_processing/plugins/kubernetes/kube_client.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/task_processing/plugins/kubernetes/kube_client.py b/task_processing/plugins/kubernetes/kube_client.py index bf8f797..7f9d3f3 100644 --- a/task_processing/plugins/kubernetes/kube_client.py +++ b/task_processing/plugins/kubernetes/kube_client.py @@ -99,12 +99,17 @@ def terminate_pod( while attempts: try: logger.info(f"Attempting to terminate {pod_name}") + # NOTE: we used to force-kill pods by setting grace_period_seconds=0, but we later + # had usecases where we wanted to allow the pod to gracefully terminate (e.g., we + # have some applications that are prone to hanging due to legacy code that do not + # correctly report failures until the end of a successful execution). + # + # For the majority of our batches, the default grace period of 30s should + # reduce to a force-kill as most things will not specifically handle SIGTERMs in a + # special way :) self.core.delete_namespaced_pod( name=pod_name, namespace=namespace, - # attempt to delete immediately - Pods launched by task_processing - # shouldn't need time to clean-up/drain - grace_period_seconds=0, # this is the default, but explcitly request background deletion of releated # objects. see: # https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/