Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix for Health State Handling in Storage Health Check #774

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions bin/shield-pipe
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,10 @@ EOF
exit 2
fi

# Add a delay to account for eventual consistency
say "waiting 5 seconds to account for storage consistency..."
sleep 5

output=$(${SHIELD_STORE_PLUGIN} retrieve -e "${SHIELD_STORE_ENDPOINT}" -k "${key}" || true)
${SHIELD_STORE_PLUGIN} purge -e "${SHIELD_STORE_ENDPOINT}" -k "${key}" || true

Expand Down
64 changes: 44 additions & 20 deletions core/scheduler/chore.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,8 @@ func (w *Worker) Execute(chore Chore) {
w.db.UpdateTaskLog(task.UUID, "\n\n")

case db.TestStoreOperation:
// DEBUG (uncomment to use) Print out the raw output that we are about to parse
// w.db.UpdateTaskLog(task.UUID, fmt.Sprintf("\nDEBUG: Attempting to parse output:\n%s\n", output))
var v struct {
Healthy bool `json:"healthy"`
}
Expand All @@ -302,36 +304,58 @@ func (w *Worker) Execute(chore Chore) {
if store == nil {
panic(fmt.Errorf("store '%s' not found in database", task.StoreUUID))
}
// To account for multiple retries, we'll parse the last line of the output to determine the health of the store
lines := strings.Split(strings.TrimSpace(output), "\n")
lastLine := lines[len(lines)-1]

if rc == 0 {
err = json.Unmarshal([]byte(output), &v)
// Attempt to parse {"healthy":true} or {"healthy":false} from `output`
err = json.Unmarshal([]byte(lastLine), &v)
if err != nil {
// If we can't parse it, assume final result is unhealthy
w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: unable to parse script output; marking storage system as UNHEALTHY.\n")
// If parsing fails, also log the error
w.db.UpdateTaskLog(task.UUID, fmt.Sprintf("DEBUG: json.Unmarshal failed: %s\n", err))
store.Healthy = false

// Update DB record
err = w.db.UpdateStoreHealth(store)
if err != nil {
panic(fmt.Errorf("failed to unmarshal output [%s] from %s operation: %s", output, task.Op, err))
panic(fmt.Errorf("failed to update store '%s' record in database: %s", task.StoreUUID, err))
}
if v.Healthy {
if store.Healthy != v.Healthy {
w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: marking storage system as HEALTHY (recovery).\n")
} else {
w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: storage is still HEALTHY.\n")
}

// Mark the task as failed
w.db.FailTask(chore.TaskUUID, time.Now())
return
}

// Now, if v.Healthy == true, we mark the store healthy; otherwise, unhealthy
if v.Healthy {
// DEBUG (uncomment to use) Print out the result of the JSON parse
// w.db.UpdateTaskLog(task.UUID, fmt.Sprintf("DEBUG: Successfully parsed JSON. v.Healthy = %t\n", v.Healthy))
if !store.Healthy {
w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: marking storage system as HEALTHY (recovery).\n")
} else {
w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: marking storage system as UNHEALTHY.\n")
w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: storage is still HEALTHY.\n")
}
store.Healthy = true

err = w.db.UpdateStoreHealth(store)
if err != nil {
panic(fmt.Errorf("failed to update store '%s' record in database: %s", task.StoreUUID, err))
}
store.Healthy = v.Healthy
w.db.CompleteTask(chore.TaskUUID, time.Now())

} else {
store.Healthy = false
w.db.UpdateTaskLog(task.UUID, "\nTEST-STORE: marking storage system as UNHEALTHY.\n")
}
store.Healthy = false

err = w.db.UpdateStoreHealth(store)
if err != nil {
panic(fmt.Errorf("failed to update store '%s' record in database: %s", task.StoreUUID, err))
}
if rc != 0 {
log.Debugf("%s: FAILING task '%s' in database", chore, chore.TaskUUID)
err = w.db.UpdateStoreHealth(store)
if err != nil {
panic(fmt.Errorf("failed to update store '%s' record in database: %s", task.StoreUUID, err))
}

// Mark the task as failed
w.db.FailTask(chore.TaskUUID, time.Now())
return
}

case db.AgentStatusOperation:
Expand Down