diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index be24bc52..5d2fdb23 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -176,7 +176,65 @@ spec:
             summary: "Fleetshard synchronizer manages `{{ $value }}` centrals."
             description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
             sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"
-
+    # AWS RDS quota alerts: warning at >= 80% and critical at >= 90% of a limit.
+    # Manual snapshots only have a warning-level alert in this group.
+    - name: rhacs-aws-quota
+      rules:
+        - alert: RHACSCentralDBClustersUtilizationHigh
+          expr: |
+            acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.8
+          for: 1h
+          labels:
+            severity: warning
+          annotations:
+            summary: "The number of RDS DB clusters is close to its limit."
+            description: |
+              Remaining DB clusters: {{ with query "acs_fleetshard_central_db_clusters_max - acs_fleetshard_central_db_clusters_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS.
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+        - alert: RHACSCentralDBInstancesUtilizationHigh
+          expr: |
+            acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.8
+          for: 1h
+          labels:
+            severity: warning
+          annotations:
+            summary: "The number of RDS DB instances is close to its limit."
+            description: |
+              Remaining DB instances: {{ with query "acs_fleetshard_central_db_instances_max - acs_fleetshard_central_db_instances_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS.
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+        - alert: RHACSCentralDBManualSnapshotsUtilizationHigh
+          expr: |
+            acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.8
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "The number of RDS DB snapshots is close to its limit."
+            description: |
+              Remaining DB manual snapshots: {{ with query "acs_fleetshard_central_db_snapshots_max - acs_fleetshard_central_db_snapshots_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS.
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+        - alert: RHACSCentralDBClustersUtilizationCritical
+          expr: |
+            acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.9
+          for: 1h
+          labels:
+            severity: critical
+          annotations:
+            summary: "The number of RDS DB clusters is very close to its limit."
+            description: |
+              Remaining DB clusters: {{ with query "acs_fleetshard_central_db_clusters_max - acs_fleetshard_central_db_clusters_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS.
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+        - alert: RHACSCentralDBInstancesUtilizationCritical
+          expr: |
+            acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.9
+          for: 1h
+          labels:
+            severity: critical
+          annotations:
+            summary: "The number of RDS DB instances is very close to its limit."
+            description: |
+              Remaining DB instances: {{ with query "acs_fleetshard_central_db_instances_max - acs_fleetshard_central_db_instances_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS.
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
     - name: rhacs-probe
       rules:
         - alert: RHACSProbeRunFailed
diff --git a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml
new file mode 100644
index 00000000..df62ff9e
--- /dev/null
+++ b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml
@@ -0,0 +1,102 @@
+# promtool unit tests for the rhacs-aws-quota alert group.
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  # DB clusters: 81/100 used from 0m, 91/100 from 71m; the limit rises to 150
+  # at 141m. Each alert must hold for 1h before firing and clears at 141m.
+  - interval: 1m
+    input_series:
+      - series: acs_fleetshard_central_db_clusters_used
+        values: "81+0x70 91+0x110"
+      - series: acs_fleetshard_central_db_clusters_max
+        values: "100+0x140 150+0x50"
+    alert_rule_test:
+      - eval_time: 50m
+        alertname: RHACSCentralDBClustersUtilizationHigh
+        exp_alerts: []
+      - eval_time: 61m
+        alertname: RHACSCentralDBClustersUtilizationHigh
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSCentralDBClustersUtilizationHigh
+              severity: warning
+            exp_annotations:
+              summary: "The number of RDS DB clusters is close to its limit."
+              description: "Remaining DB clusters: 19. A quota increase should be requested from AWS.\n"
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+      - eval_time: 131m
+        alertname: RHACSCentralDBClustersUtilizationCritical
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSCentralDBClustersUtilizationCritical
+              severity: critical
+            exp_annotations:
+              summary: "The number of RDS DB clusters is very close to its limit."
+              description: "Remaining DB clusters: 9. A quota increase must be requested from AWS.\n"
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+      - eval_time: 141m
+        alertname: RHACSCentralDBClustersUtilizationCritical
+        exp_alerts: []
+  # DB instances: 165/200 used from 0m, 182/200 from 71m; the limit rises to
+  # 300 at 141m. Each alert must hold for 1h before firing and clears at 141m.
+  - interval: 1m
+    input_series:
+      - series: acs_fleetshard_central_db_instances_used
+        values: "165+0x70 182+0x110"
+      - series: acs_fleetshard_central_db_instances_max
+        values: "200+0x140 300+0x50"
+    alert_rule_test:
+      - eval_time: 50m
+        alertname: RHACSCentralDBInstancesUtilizationHigh
+        exp_alerts: []
+      - eval_time: 61m
+        alertname: RHACSCentralDBInstancesUtilizationHigh
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSCentralDBInstancesUtilizationHigh
+              severity: warning
+            exp_annotations:
+              summary: "The number of RDS DB instances is close to its limit."
+              description: "Remaining DB instances: 35. A quota increase should be requested from AWS.\n"
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+      - eval_time: 131m
+        alertname: RHACSCentralDBInstancesUtilizationCritical
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSCentralDBInstancesUtilizationCritical
+              severity: critical
+            exp_annotations:
+              summary: "The number of RDS DB instances is very close to its limit."
+              description: "Remaining DB instances: 18. A quota increase must be requested from AWS.\n"
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+      - eval_time: 141m
+        alertname: RHACSCentralDBInstancesUtilizationCritical
+        exp_alerts: []
+  # DB snapshots: 801/1000 used from 0m, 901 from 61m; the limit rises to 1500
+  # at 71m. The warning must hold for 5m before firing and clears at 71m.
+  - interval: 1m
+    input_series:
+      - series: acs_fleetshard_central_db_snapshots_used
+        values: "801+0x60 901+0x60"
+      - series: acs_fleetshard_central_db_snapshots_max
+        values: "1000+0x70 1500+0x50"
+    alert_rule_test:
+      - eval_time: 2m
+        alertname: RHACSCentralDBManualSnapshotsUtilizationHigh
+        exp_alerts: []
+      - eval_time: 7m
+        alertname: RHACSCentralDBManualSnapshotsUtilizationHigh
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSCentralDBManualSnapshotsUtilizationHigh
+              severity: warning
+            exp_annotations:
+              summary: "The number of RDS DB snapshots is close to its limit."
+              description: "Remaining DB manual snapshots: 199. A quota increase should be requested from AWS.\n"
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md"
+      - eval_time: 71m
+        alertname: RHACSCentralDBManualSnapshotsUtilizationHigh
+        exp_alerts: []