Skip to content

Commit

Permalink
ROX-15234: Alert when instance has pause-reconcile for more than 2 days (#102)
Browse files Browse the repository at this point in the history
  • Loading branch information
vladbologa authored Jun 2, 2023
1 parent 3a52a34 commit b7c00b5
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 0 deletions.
9 changes: 9 additions & 0 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,15 @@ spec:
summary: "Fleetshard synchronizer manages `{{ $value }}` centrals."
description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"
# Warns when an instance has reported pause-reconcile continuously for more
# than two days, so that a forgotten pause does not go unnoticed.
# NOTE(review): the preceding alert carries a `sop_url` annotation; consider
# adding one here once a runbook for this alert exists.
- alert: RHACSFleetshardSyncCentralReconcilePaused
  # Matches every series whose current value is exactly 1; one alert is
  # produced per matching label set.
  expr: |
    acs_fleetshard_pause_reconcile_instances == 1
  # The expression must hold without interruption for 2 days before the alert
  # moves from pending to firing.
  for: 2d
  labels:
    severity: warning
  annotations:
    # NOTE(review): both annotations read the `instance` label of the series;
    # presumably it carries the ACS instance ID rather than the scrape target
    # address -- confirm against the metric's exporter/scrape configuration.
    summary: "ACS instance {{ $labels.instance }} has paused reconciliation for more than 2 days."
    description: "ACS instance {{ $labels.instance }} has the 'pause-reconcile' annotation and therefore is not being managed by the ACS operator. Please check that this is intended."
- name: rhacs-aws-quota
rules:
- alert: RHACSCentralDBClustersUtilizationHigh
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# promtool unit test for the RHACSFleetshardSyncCentralReconcilePaused alert.
# The rule file under test is loaded from the path listed in `rule_files`.
rule_files:
  - /tmp/prometheus-rules-test.yaml

# Rules are evaluated once per hour, matching the sample spacing used below.
evaluation_interval: 1h

tests:
  - interval: 1h
    input_series:
      # Timeline with one sample per `interval` (expanding notation `a+bxn`
      # yields n+1 samples):
      #   0h       -> 0  (not paused)
      #   1h..51h  -> 1  (paused; 51 samples)
      #   52h..54h -> 0  (pause lifted; 3 samples)
      - series: 'acs_fleetshard_pause_reconcile_instances{instance="rhacs-chs64i3dabr0026a6fag"}'
        values: "0+0x0 1+0x50 0+0x2"
    alert_rule_test:
      # 1h: the series has only just become 1, so nothing is firing yet.
      - eval_time: 1h
        alertname: RHACSFleetshardSyncCentralReconcilePaused
        exp_alerts: []
      # 30h: paused for 29h, still short of the rule's `for: 2d`.
      - eval_time: 30h
        alertname: RHACSFleetshardSyncCentralReconcilePaused
        exp_alerts: []
      # 49h: the series has been 1 for 48h (since 1h), so the alert fires.
      - eval_time: 49h
        alertname: RHACSFleetshardSyncCentralReconcilePaused
        exp_alerts:
          - exp_labels:
              alertname: RHACSFleetshardSyncCentralReconcilePaused
              instance: rhacs-chs64i3dabr0026a6fag
              severity: warning
            exp_annotations:
              summary: "ACS instance rhacs-chs64i3dabr0026a6fag has paused reconciliation for more than 2 days."
              description: "ACS instance rhacs-chs64i3dabr0026a6fag has the 'pause-reconcile' annotation and therefore is not being managed by the ACS operator. Please check that this is intended."
      # 52h: the series is back to 0, so the alert is no longer firing.
      - eval_time: 52h
        alertname: RHACSFleetshardSyncCentralReconcilePaused
        exp_alerts: []

0 comments on commit b7c00b5

Please sign in to comment.