From 2cff00f6d2be338da9ff660093aa68c0cd2f51b3 Mon Sep 17 00:00:00 2001
From: Luis Perez <luisp@yelp.com>
Date: Thu, 27 Jun 2024 07:36:43 -0700
Subject: [PATCH] Allow task_processing-launched pods to handle their own
 cleanup

We have some use cases where we'd like a set of batches to run some
cleanup code upon receiving a SIGTERM, so we'll need to remove our
force-kill code and instead fall back to the default k8s behavior of
giving Pods 30s after sending a SIGTERM before sending a SIGKILL to
force-terminate them :)

I will also follow this up with a Tron documentation change that
documents this behavior.
---
 task_processing/plugins/kubernetes/kube_client.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/task_processing/plugins/kubernetes/kube_client.py b/task_processing/plugins/kubernetes/kube_client.py
index bf8f797..7f9d3f3 100644
--- a/task_processing/plugins/kubernetes/kube_client.py
+++ b/task_processing/plugins/kubernetes/kube_client.py
@@ -99,12 +99,17 @@ def terminate_pod(
         while attempts:
             try:
                 logger.info(f"Attempting to terminate {pod_name}")
+                # NOTE: we used to force-kill pods by setting grace_period_seconds=0, but we later
+                # had use cases where we wanted to allow the pod to gracefully terminate (e.g., we
+                # have some applications that are prone to hanging due to legacy code that does not
+                # correctly report failures until the end of a successful execution).
+                #
+                # For the majority of our batches, the default grace period of 30s should
+                # effectively amount to a force-kill, as most things will not handle SIGTERM in
+                # any special way :)
                 self.core.delete_namespaced_pod(
                     name=pod_name,
                     namespace=namespace,
-                    # attempt to delete immediately - Pods launched by task_processing
-                    # shouldn't need time to clean-up/drain
-                    grace_period_seconds=0,
                     # this is the default, but explcitly request background deletion of releated
                     # objects. see:
                     # https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/