Skip to content

Commit

Permalink
Merge pull request #99 from stackrox/vbologa/rds_quota_alerts
Browse files Browse the repository at this point in the history
ROX-16561: Add alerts for AWS quotas
  • Loading branch information
vladbologa authored May 31, 2023
2 parents 219bb2d + dabbee6 commit 3a52a34
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 1 deletion.
58 changes: 57 additions & 1 deletion resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,63 @@ spec:
summary: "Fleetshard synchronizer manages `{{ $value }}` centrals."
description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"

# Alerts on AWS RDS quota utilization as reported by the
# acs_fleetshard_central_db_*_used / *_max metric pairs.
# Each alert compares the used/max ratio against a threshold:
# "High" rules fire at >= 0.8 (severity: warning) and "Critical" rules
# at >= 0.9 (severity: critical). No Critical rule is defined for
# snapshots in this group.
#
# Every description runs a template query for `max - used` and renders the
# value of the first returned sample.
# NOTE(review): that query is not filtered by the alert's labels, so if the
# metrics ever carry more than one series the rendered number may not belong
# to the series that triggered the alert — presumably there is exactly one
# series per metric; confirm.
- name: rhacs-aws-quota
  rules:
    # DB clusters: warning once utilization has been >= 80% for 1h.
    - alert: RHACSCentralDBClustersUtilizationHigh
      expr: |
        acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.8
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "The number of RDS DB clusters is close to its limit."
        description: |
          Remaining DB clusters: {{ with query "acs_fleetshard_central_db_clusters_max - acs_fleetshard_central_db_clusters_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS.
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # DB instances: warning once utilization has been >= 80% for 1h.
    - alert: RHACSCentralDBInstancesUtilizationHigh
      expr: |
        acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.8
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "The number of RDS DB instances is close to its limit."
        description: |
          Remaining DB instances: {{ with query "acs_fleetshard_central_db_instances_max - acs_fleetshard_central_db_instances_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS.
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # DB manual snapshots: warning once utilization has been >= 80% for 5m
    # (a shorter hold time than the 1h used by the other rules in this group).
    - alert: RHACSCentralDBManualSnapshotsUtilizationHigh
      expr: |
        acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.8
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "The number of RDS DB snapshots is close to its limit."
        description: |
          Remaining DB manual snapshots: {{ with query "acs_fleetshard_central_db_snapshots_max - acs_fleetshard_central_db_snapshots_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS.
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # DB clusters: critical once utilization has been >= 90% for 1h.
    - alert: RHACSCentralDBClustersUtilizationCritical
      expr: |
        acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.9
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: "The number of RDS DB clusters is very close to its limit."
        description: |
          Remaining DB clusters: {{ with query "acs_fleetshard_central_db_clusters_max - acs_fleetshard_central_db_clusters_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS.
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # DB instances: critical once utilization has been >= 90% for 1h.
    - alert: RHACSCentralDBInstancesUtilizationCritical
      expr: |
        acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.9
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: "The number of RDS DB instances is very close to its limit."
        description: |
          Remaining DB instances: {{ with query "acs_fleetshard_central_db_instances_max - acs_fleetshard_central_db_instances_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS.
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
- name: rhacs-probe
rules:
- alert: RHACSProbeRunFailed
Expand Down
98 changes: 98 additions & 0 deletions resources/prometheus/unit_tests/RHACSAWSQuota.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
# DB clusters quota: exercises the High (>= 0.8) and Critical (>= 0.9) alerts.
# Series timeline at 1m spacing ("a+0xN" expands to N+1 samples of a):
#   used: 81 for t=0..70m, then 91 for t=71..181m
#   max:  100 for t=0..140m, then 150 for t=141..191m
- interval: 1m
  input_series:
    - series: acs_fleetshard_central_db_clusters_used
      values: "81+0x70 91+0x110"
    - series: acs_fleetshard_central_db_clusters_max
      values: "100+0x140 150+0x50"
  alert_rule_test:
    # 81/100 is above the warning threshold, but the 1h hold time has not
    # elapsed yet, so nothing is firing.
    - eval_time: 50m
      alertname: RHACSCentralDBClustersUtilizationHigh
      exp_alerts: []

    # Hold time elapsed: the warning fires with 100 - 81 = 19 remaining.
    - eval_time: 61m
      alertname: RHACSCentralDBClustersUtilizationHigh
      exp_alerts:
        - exp_labels:
            alertname: RHACSCentralDBClustersUtilizationHigh
            severity: warning
          exp_annotations:
            summary: "The number of RDS DB clusters is close to its limit."
            description: "Remaining DB clusters: 19. A quota increase should be requested from AWS.\n"
            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # 91/100 has held since t=71m, so the critical alert fires after 1h
    # with 100 - 91 = 9 remaining.
    - eval_time: 131m
      alertname: RHACSCentralDBClustersUtilizationCritical
      exp_alerts:
        - exp_labels:
            alertname: RHACSCentralDBClustersUtilizationCritical
            severity: critical
          exp_annotations:
            summary: "The number of RDS DB clusters is very close to its limit."
            description: "Remaining DB clusters: 9. A quota increase must be requested from AWS.\n"
            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # The limit was raised to 150 at t=141m; 91/150 is below the critical
    # threshold, so the alert resolves.
    - eval_time: 141m
      alertname: RHACSCentralDBClustersUtilizationCritical
      exp_alerts: []
# DB instances quota: exercises the High (>= 0.8) and Critical (>= 0.9) alerts.
# Series timeline at 1m spacing ("a+0xN" expands to N+1 samples of a):
#   used: 165 for t=0..70m, then 182 for t=71..181m
#   max:  200 for t=0..140m, then 300 for t=141..191m
- interval: 1m
  input_series:
    - series: acs_fleetshard_central_db_instances_used
      values: "165+0x70 182+0x110"
    - series: acs_fleetshard_central_db_instances_max
      values: "200+0x140 300+0x50"
  alert_rule_test:
    # 165/200 is above the warning threshold, but the 1h hold time has not
    # elapsed yet, so nothing is firing.
    - eval_time: 50m
      alertname: RHACSCentralDBInstancesUtilizationHigh
      exp_alerts: []

    # Hold time elapsed: the warning fires with 200 - 165 = 35 remaining.
    - eval_time: 61m
      alertname: RHACSCentralDBInstancesUtilizationHigh
      exp_alerts:
        - exp_labels:
            alertname: RHACSCentralDBInstancesUtilizationHigh
            severity: warning
          exp_annotations:
            summary: "The number of RDS DB instances is close to its limit."
            description: "Remaining DB instances: 35. A quota increase should be requested from AWS.\n"
            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # 182/200 has held since t=71m, so the critical alert fires after 1h
    # with 200 - 182 = 18 remaining.
    - eval_time: 131m
      alertname: RHACSCentralDBInstancesUtilizationCritical
      exp_alerts:
        - exp_labels:
            alertname: RHACSCentralDBInstancesUtilizationCritical
            severity: critical
          exp_annotations:
            summary: "The number of RDS DB instances is very close to its limit."
            description: "Remaining DB instances: 18. A quota increase must be requested from AWS.\n"
            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # The limit was raised to 300 at t=141m; 182/300 is below the critical
    # threshold, so the alert resolves.
    - eval_time: 141m
      alertname: RHACSCentralDBInstancesUtilizationCritical
      exp_alerts: []
# DB manual snapshots quota: exercises the High (>= 0.8, for 5m) alert.
# Series timeline at 1m spacing ("a+0xN" expands to N+1 samples of a):
#   used: 801 for t=0..60m, then 901 for t=61..121m
#   max:  1000 for t=0..70m, then 1500 for t=71..121m
- interval: 1m
  input_series:
    - series: acs_fleetshard_central_db_snapshots_used
      values: "801+0x60 901+0x60"
    - series: acs_fleetshard_central_db_snapshots_max
      values: "1000+0x70 1500+0x50"
  alert_rule_test:
    # 801/1000 is above the warning threshold, but the 5m hold time has not
    # elapsed yet, so nothing is firing.
    - eval_time: 2m
      alertname: RHACSCentralDBManualSnapshotsUtilizationHigh
      exp_alerts: []

    # Hold time elapsed: the warning fires with 1000 - 801 = 199 remaining.
    # The expected alertname label names the alert under test.
    - eval_time: 7m
      alertname: RHACSCentralDBManualSnapshotsUtilizationHigh
      exp_alerts:
        - exp_labels:
            alertname: RHACSCentralDBManualSnapshotsUtilizationHigh
            severity: warning
          exp_annotations:
            summary: "The number of RDS DB snapshots is close to its limit."
            description: "Remaining DB manual snapshots: 199. A quota increase should be requested from AWS.\n"
            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"

    # The limit was raised to 1500 at t=71m; 901/1500 is below the warning
    # threshold, so the alert resolves.
    - eval_time: 71m
      alertname: RHACSCentralDBManualSnapshotsUtilizationHigh
      exp_alerts: []

0 comments on commit 3a52a34

Please sign in to comment.