From a96983f22027ca0ffd6073f83832aeeccd9a2200 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibaut=20Barr=C3=A8re?= Date: Thu, 16 Jan 2025 10:56:28 +0100 Subject: [PATCH] Stop halting the system in the middle of the test suite Also, return a non-zero exit code to provide an extra hint which would have helped me here. --- .../transport_web/plugs/worker_healthcheck.ex | 28 +++++++++++++++++-- .../plugs/worker_healthcheck_test.exs | 6 +++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/apps/transport/lib/transport_web/plugs/worker_healthcheck.ex b/apps/transport/lib/transport_web/plugs/worker_healthcheck.ex index 3066e88e5f..c8266edda6 100644 --- a/apps/transport/lib/transport_web/plugs/worker_healthcheck.ex +++ b/apps/transport/lib/transport_web/plugs/worker_healthcheck.ex @@ -43,8 +43,7 @@ defmodule TransportWeb.Plugs.WorkerHealthcheck do # if the app is completely down. if !healthy_state?() do Logger.info("Hot-fix: shutting down!!!") - # "Asynchronously and carefully stops the Erlang runtime system." - System.stop() + stop_the_beam!() end conn @@ -53,6 +52,31 @@ defmodule TransportWeb.Plugs.WorkerHealthcheck do end end + @doc """ + A fix for https://github.com/etalab/transport-site/issues/4377. + + If the worker sees that no jobs have been attempted by Oban for some time, + this plug's logic stops the whole program (BEAM/VM) completely. Because the + Clever Cloud monitoring checks that it can open a socket to the 8080 port, + this makes the check fail, hence resulting in an automatic restart. + + This is a cheap but so far effective way to ensure the worker gets restarted + when it malfunctions. + """ + def stop_the_beam! do + # "Asynchronously and carefully stops the Erlang runtime system." + if Mix.env() == :test do + # We do not want to stop the system during tests, because it + # gives the impression the test suite completed successfully, but + # it would actually just bypass all the tests after the one running this! 
+ raise "would halt the BEAM" + else + # Also make sure to return with a non-zero exit code, to more clearly + # indicate that this is not the normal output + System.stop(1) + end + end + def store_last_attempted_at_delay_metric do value = DateTime.diff(oban_last_attempted_at(), DateTime.utc_now(), :second) Appsignal.add_distribution_value("oban.last_attempted_at_delay", value) diff --git a/apps/transport/test/transport_web/plugs/worker_healthcheck_test.exs b/apps/transport/test/transport_web/plugs/worker_healthcheck_test.exs index ce6da59f9d..cc1c17ef4e 100644 --- a/apps/transport/test/transport_web/plugs/worker_healthcheck_test.exs +++ b/apps/transport/test/transport_web/plugs/worker_healthcheck_test.exs @@ -121,7 +121,11 @@ defmodule TransportWeb.Plugs.WorkerHealthcheckTest do refute WorkerHealthcheck.oban_attempted_jobs_recently?() refute WorkerHealthcheck.healthy_state?() - assert conn |> WorkerHealthcheck.call(if: {__MODULE__, :plug_enabled?}) |> text_response(503) + # Current fix for job ops troubles: the system must stop completely when unhealthy, + # so that Clever Cloud will restart it. + assert_raise RuntimeError, ~r/would halt the BEAM/, fn -> + conn |> WorkerHealthcheck.call(if: {__MODULE__, :plug_enabled?}) + end end end