From 0b27c4e77acd1cdd42dab4bd7d2c42d431e922f4 Mon Sep 17 00:00:00 2001
From: "Illuminatus [CCIO]"
Date: Mon, 6 Jan 2025 10:29:57 -0800
Subject: [PATCH 1/2] Include healthcheck logic for helper scripts running as sidecars

---
Review notes (below the "---" marker, so not part of the commit message):
* check_node: the Koios re-query reuses ${URL} with -s, matching the first
  query. KOIOS_URL is not assigned anywhere in this script (confirm the
  sourced env file does not provide it).
* MAIN: scripts without an entry in SCRIPT_TO_BINARY_MAP fall back to
  ENTRYPOINT_PROCESS itself instead of passing an empty name to check_process.
* check_process: SLEEPING_SCRIPTS is joined with [*] for the membership test
  (ShellCheck SC2199); the result is unchanged.
* Line counts of the hunk are unchanged, so PATCH 2/2 still applies on top.

 files/docker/node/addons/healthcheck.sh | 155 +++++++++++++++++++-----
 1 file changed, 122 insertions(+), 33 deletions(-)

diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh
index 4de0837b1..7bd9a7127 100755
--- a/files/docker/node/addons/healthcheck.sh
+++ b/files/docker/node/addons/healthcheck.sh
@@ -1,43 +1,132 @@
-#!/usr/bin/env bash
+#!/bin/bash
+# shellcheck source=/dev/null
+#
+######################################
+# User Variables - Change as desired #
+# Common variables set in env file   #
+######################################
 
-source /opt/cardano/cnode/scripts/env
+ENTRYPOINT_PROCESS="${ENTRYPOINT_PROCESS:-cnode.sh}"  # Get the script from ENTRYPOINT_PROCESS or default to "cnode.sh" if not set
+CPU_THRESHOLD="${CPU_THRESHOLD:-80}"  # The CPU threshold to warn about if the sidecar process exceeds this for more than 60 seconds, defaults to 80%
+RETRIES="${RETRIES:-20}"  # The number of retries if tip is not incrementing, or cpu usage is over the threshold
 
-CCLI=$(which cardano-cli)
+######################################
+# Do NOT modify code below           #
+######################################
 
-if [[ "$NETWORK" == "guild-mainnet" ]]; then NETWORK=mainnet; fi
+if [[ "${ENTRYPOINT_PROCESS}" == "cnode.sh" ]]; then
+  source /opt/cardano/cnode/scripts/env
+else
+  # Source in offline mode for sidecar helper scripts
+  source /opt/cardano/cnode/scripts/env offline
+fi
 
-# For querying tip, the seperation of testnet-magic vs mainnet as argument is optional
+# Define a mapping of scripts to their corresponding binaries, when defined check the binary is running and its CPU usage instead of the wrapper script.
+declare -A SCRIPT_TO_BINARY_MAP
+SCRIPT_TO_BINARY_MAP=(
+  ["cncli.sh"]="cncli"
+  ["mithril-signer.sh"]="mithril-signer"
+)
 
-FIRST=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block)
+# Define scripts which may sleep between executions of the binary.
+SLEEPING_SCRIPTS=("cncli.sh")
 
-if [[ "${ENABLE_KOIOS}" == "N" ]] || [[ -z "${KOIOS_API}" ]]; then
-  # when KOIOS is not enabled or KOIOS_API is unset, use default behavior
-  sleep 60;
-  SECOND=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block)
-  if [[ "$FIRST" -ge "$SECOND" ]]; then
-    echo "there is a problem"
-    exit 1
+# Function to check if a process is running and its CPU usage
+check_process() {
+  local process_name="$1"
+  local cpu_threshold="$2"
+
+  for (( CHECK=1; CHECK<=RETRIES; CHECK++ )); do
+    # Check CPU usage of the process
+    CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print s}')
+
+    # Check if CPU usage exceeds threshold
+    if (( CPU_USAGE > cpu_threshold )); then
+      echo "Warning: High CPU usage detected for '$process_name' ($CPU_USAGE%)"
+      sleep 3  # Retry after a pause
+      continue
+    fi
+
+    # Check if ENTRYPOINT_PROCESS is in the SLEEPING_SCRIPTS array
+    if [[ " ${SLEEPING_SCRIPTS[*]} " =~ " ${ENTRYPOINT_PROCESS} " ]]; then
+      # If the process is in SLEEPING_SCRIPTS, check if either the process or 'sleep' is running
+      if ! pgrep -x "$process_name" > /dev/null && ! pgrep -x "sleep" > /dev/null; then
+        echo "Error: '$process_name' is not running, and no 'sleep' process found"
+        return 3  # Return 3 if the process is not running and sleep is not found
+      fi
+    else
+      # If the process is not in SLEEPING_SCRIPTS, only check for the specific process
+      if ! pgrep -x "$process_name" > /dev/null; then
+        echo "Error: '$process_name' is not running"
+        return 3  # Return 3 if the process is not running
+      fi
+    fi
+
+    echo "We're healthy - $process_name"
+    return 0  # Return 0 if the process is healthy
+  done
+
+  echo "Max retries reached for $process_name"
+  return 1  # Return 1 if retries are exhausted
+}
+
+
+# Function to check if the node is running and is on tip
+check_node() {
+  CCLI=$(which cardano-cli)
+
+  # Adjust NETWORK variable if needed
+  if [[ "$NETWORK" == "guild-mainnet" ]]; then NETWORK=mainnet; fi
+
+  FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block)
+
+  if [[ "${ENABLE_KOIOS}" == "N" ]] || [[ -z "${KOIOS_API}" ]]; then
+    sleep 60
+    SECOND=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block)
+    if [[ "$FIRST" -ge "$SECOND" ]]; then
+      echo "There is a problem"
+      exit 1
+    else
+      echo "We're healthy - node: $FIRST -> node: $SECOND"
+    fi
   else
-    echo "we're healthy - node: $FIRST -> node: $SECOND"
+    CURL=$(which curl)
+    JQ=$(which jq)
+    URL="${KOIOS_API}/tip"
+    SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no')
+
+    for (( CHECK=1; CHECK<=RETRIES; CHECK++ )); do
+      if [[ "$FIRST" -eq "$SECOND" ]]; then
+        echo "We're healthy - node: $FIRST == koios: $SECOND"
+        exit 0
+      elif [[ "$FIRST" -lt "$SECOND" ]]; then
+        sleep 3
+        FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block)
+      elif [[ "$FIRST" -gt "$SECOND" ]]; then
+        sleep 3
+        SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no')
+      fi
+    done
+    echo "There is a problem"
+    exit 1
   fi
+}
+
+# MAIN
+if [[ "$ENTRYPOINT_PROCESS" == "cnode.sh" ]]; then
+  # The original health check logic for "cnode.sh"
+  check_node
 else
-  # else leverage koios and only require the node is on tip
-  CURL=$(which curl)
-  JQ=$(which jq)
-  URL="${KOIOS_API}/tip"
-  SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no')
-  for (( CHECK=1; CHECK<=20; CHECK++ )); do
-    if [[ "$FIRST" -eq "$SECOND" ]]; then
-      echo "we're healthy - node: $FIRST == koios: $SECOND"
-      exit 0
-    elif [[ "$FIRST" -lt "$SECOND" ]]; then
-      sleep 3
-      FIRST=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block)
-    elif [[ "$FIRST" -gt "$SECOND" ]]; then
-      sleep 3
-      SECOND=$($CURL "${KOIOS_URL}" | $JQ '.[0].block_no')
-    fi
-  done
-  echo "there is a problem"
-  exit 1
+  # Determine the process name to check: use the mapped binary when one is
+  # defined, otherwise fall back to the entrypoint script itself so that
+  # check_process never receives an empty name.
+  process="${SCRIPT_TO_BINARY_MAP[$ENTRYPOINT_PROCESS]:-$ENTRYPOINT_PROCESS}"
+  echo "Checking health for process: $process"
+  check_process "$process" "$CPU_THRESHOLD"
+  exit $?
 fi
+
+# If all checks pass, return healthy status
+echo "Container is healthy"
+exit 0
+

From 257647c8c699ed519b942cc197bf92c4605430da Mon Sep 17 00:00:00 2001
From: illuminatus
Date: Tue, 7 Jan 2025 16:18:05 -0800
Subject: [PATCH 2/2] Round CPU_USAGE to an int and if less than 0.5 from threshold cause it to round up.

---
 files/docker/node/addons/healthcheck.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/files/docker/node/addons/healthcheck.sh b/files/docker/node/addons/healthcheck.sh
index 7bd9a7127..d91ded7a6 100755
--- a/files/docker/node/addons/healthcheck.sh
+++ b/files/docker/node/addons/healthcheck.sh
@@ -38,7 +38,7 @@ check_process() {
 
   for (( CHECK=1; CHECK<=RETRIES; CHECK++ )); do
     # Check CPU usage of the process
-    CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print s}')
+    CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print int(s + 0.5)}')
 
     # Check if CPU usage exceeds threshold
     if (( CPU_USAGE > cpu_threshold )); then