From 0e3fe4a4271b1ef877f9207b18566ba819a11162 Mon Sep 17 00:00:00 2001 From: Ioannis Tsouvalas Date: Thu, 9 Jan 2025 17:58:01 +0200 Subject: [PATCH] fix: storage health fix and consistency handling - Added a delay (`sleep 5`) in `bin/shield-pipe` to address eventual consistency issues in the storage backend. - Updated `core/scheduler/chore.go` to ensure the final retry attempt determines the health state, preventing premature unhealthy state marking. - Improved health check parsing by ensuring only the last line of the output is evaluated. - Adjusted error handling to avoid marking storage as unhealthy on transient failures. - Added debug logging for improved visibility during health state evaluation. Resolves: Issue with health state persisting as unhealthy after successful retries, and test-store failure due to eventual consistency issues. --- bin/shield-pipe | 4 +++ core/scheduler/chore.go | 64 ++++++++++++++++++++++++++++------------- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/bin/shield-pipe b/bin/shield-pipe index 18bce711b..23b9df8ac 100755 --- a/bin/shield-pipe +++ b/bin/shield-pipe @@ -185,6 +185,10 @@ EOF exit 2 fi + # Add a delay to account for eventual consistency + say "waiting 5 seconds to account for storage consistency..." + sleep 5 + output=$(${SHIELD_STORE_PLUGIN} retrieve -e "${SHIELD_STORE_ENDPOINT}" -k "${key}" || true) ${SHIELD_STORE_PLUGIN} purge -e "${SHIELD_STORE_ENDPOINT}" -k "${key}" || true diff --git a/core/scheduler/chore.go b/core/scheduler/chore.go index 3a7e8710a..aa2e54968 100644 --- a/core/scheduler/chore.go +++ b/core/scheduler/chore.go @@ -291,6 +291,8 @@ func (w *Worker) Execute(chore Chore) { w.db.UpdateTaskLog(task.UUID, "\n\n") case db.TestStoreOperation: + // DEBUG (uncomment to use) Print out the raw output that we are about to parse + // w.db.UpdateTaskLog(task.UUID, fmt.Sprintf("\nDEBUG: Attempting to parse output:\n%s\n", output)) var v struct { Healthy bool `json:"healthy"` } @@ -302,36 +304,58 @@ func (w *Worker) Execute(chore Chore) { if store == nil { panic(fmt.Errorf("store '%s' not found in database", task.StoreUUID)) } + // To account for multiple retries, we'll parse the last line of the output to determine the health of the store + lines := strings.Split(strings.TrimSpace(output), "\n") + lastLine := lines[len(lines)-1] - if rc == 0 { - err = json.Unmarshal([]byte(output), &v) + // Attempt to parse {"healthy":true} or {"healthy":false} from `output` + err = json.Unmarshal([]byte(lastLine), &v) + if err != nil { + // If we can't parse it, assume final result is unhealthy + w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: unable to parse script output; marking storage system as UNHEALTHY.\n") + // If parsing fails, also log the error + w.db.UpdateTaskLog(task.UUID, fmt.Sprintf("DEBUG: json.Unmarshal failed: %s\n", err)) + store.Healthy = false + + // Update DB record + err = w.db.UpdateStoreHealth(store) if err != nil { - panic(fmt.Errorf("failed to unmarshal output [%s] from %s operation: %s", output, task.Op, err)) + panic(fmt.Errorf("failed to update store '%s' record in database: %s", task.StoreUUID, err)) } - if v.Healthy { - if store.Healthy != v.Healthy { - w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: marking storage system as HEALTHY (recovery).\n") - } else { - w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: storage is still HEALTHY.\n") - } + + // Mark the task as failed + w.db.FailTask(chore.TaskUUID, time.Now()) + return + } + + // Now, if v.Healthy == true, we mark the store healthy; otherwise, unhealthy + if v.Healthy { + // DEBUG (uncomment to use) Print out the result of the JSON parse + // w.db.UpdateTaskLog(task.UUID, fmt.Sprintf("DEBUG: Successfully parsed JSON. v.Healthy = %t\n", v.Healthy)) + if !store.Healthy { + w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: marking storage system as HEALTHY (recovery).\n") } else { - w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: marking storage system as UNHEALTHY.\n") + w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: storage is still HEALTHY.\n") + } + store.Healthy = true + + err = w.db.UpdateStoreHealth(store) + if err != nil { + panic(fmt.Errorf("failed to update store '%s' record in database: %s", task.StoreUUID, err)) } - store.Healthy = v.Healthy + w.db.CompleteTask(chore.TaskUUID, time.Now()) } else { - store.Healthy = false w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: marking storage system as UNHEALTHY.\n") - } + store.Healthy = false - err = w.db.UpdateStoreHealth(store) - if err != nil { - panic(fmt.Errorf("failed to update store '%s' record in database: %s", task.StoreUUID, err)) - } - if rc != 0 { - log.Debugf("%s: FAILING task '%s' in database", chore, chore.TaskUUID) + err = w.db.UpdateStoreHealth(store) + if err != nil { + panic(fmt.Errorf("failed to update store '%s' record in database: %s", task.StoreUUID, err)) + } + + // Mark the task as failed w.db.FailTask(chore.TaskUUID, time.Now()) - return } case db.AgentStatusOperation: