From a7860dc4b223adc3f17a727022947923131fde01 Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Fri, 26 May 2023 11:40:44 +0200 Subject: [PATCH 1/9] ROX-16561: Add alerts for AWS quotas --- resources/prometheus/prometheus-rules.yaml | 65 +++++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index be24bc52..0bf28c97 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -175,8 +175,69 @@ spec: annotations: summary: "Fleetshard synchronizer manages `{{ $value }}` centrals." description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md" - + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md" + - name: rhacs-aws-quota + rules: + - alert: RHACSCentralDBClustersUtilizationHigh + expr: | + expr: acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.8 + for: 1h + labels: + severity: warning + annotations: + summary: "The number of RDS DB clusters is close to its limit." + description: "The number of AWS RDS DB clusters is close to its limit. A quota increase should be requested from AWS." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - alert: RHACSCentralDBInstancesUtilizationHigh + expr: | + expr: acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.8 + for: 1h + labels: + severity: warning + annotations: + summary: "The number of RDS DB instances is close to its limit." + description: "The number of AWS RDS DB instances is close to its limit. A quota increase should be requested from AWS." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - alert: RHACSCentralDBManualSnapshotsUtilizationHigh + expr: | + expr: acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.8 + for: 1h + labels: + severity: warning + annotations: + summary: "The number of RDS DB snapshots is close to its limit." + description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase should be requested from AWS." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - alert: RHACSCentralDBClustersUtilizationCritical + expr: | + expr: acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.9 + for: 1h + labels: + severity: critical + annotations: + summary: "The number of RDS DB clusters is very close to its limit." + description: "The number of AWS RDS DB clusters is close to its limit. A quota increase must be requested from AWS." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - alert: RHACSCentralDBInstancesUtilizationCritical + expr: | + expr: acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.9 + for: 1h + labels: + severity: critical + annotations: + summary: "The number of RDS DB instances is very close to its limit." + description: "The number of AWS RDS DB instances is close to its limit. A quota increase must be requested from AWS." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - alert: RHACSCentralDBManualSnapshotsUtilizationCritical + expr: | + expr: acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.9 + for: 1h + labels: + severity: critical + annotations: + summary: "The number of RDS DB snapshots is very close to its limit." + description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase must be requested from AWS." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" - name: rhacs-probe rules: - alert: RHACSProbeRunFailed From 75c912998022fef5a30f9eceb35ff33deaf984b5 Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Fri, 26 May 2023 11:50:20 +0200 Subject: [PATCH 2/9] Adjust time interval for DB snapshots --- resources/prometheus/prometheus-rules.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 0bf28c97..83dad061 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -201,7 +201,7 @@ spec: - alert: RHACSCentralDBManualSnapshotsUtilizationHigh expr: | expr: acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.8 - for: 1h + for: 5m labels: severity: warning annotations: @@ -231,7 +231,7 @@ spec: - alert: RHACSCentralDBManualSnapshotsUtilizationCritical expr: | expr: acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.9 - for: 1h + for: 5m labels: severity: critical annotations: From a9967d1691c759f752afdefd3f400a6526d8a584 Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Fri, 26 May 2023 12:35:51 +0200 Subject: [PATCH 3/9] Fixes --- resources/prometheus/prometheus-rules.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 83dad061..ffe95b89 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -180,7 +180,7 @@ spec: rules: - alert: RHACSCentralDBClustersUtilizationHigh expr: | - expr: acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.8 + acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.8 for: 1h labels: severity: 
warning @@ -190,7 +190,7 @@ spec: sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" - alert: RHACSCentralDBInstancesUtilizationHigh expr: | - expr: acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.8 + acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.8 for: 1h labels: severity: warning @@ -200,7 +200,7 @@ spec: sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" - alert: RHACSCentralDBManualSnapshotsUtilizationHigh expr: | - expr: acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.8 + acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.8 for: 5m labels: severity: warning @@ -210,7 +210,7 @@ spec: sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" - alert: RHACSCentralDBClustersUtilizationCritical expr: | - expr: acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.9 + acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.9 for: 1h labels: severity: critical @@ -220,7 +220,7 @@ spec: sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" - alert: RHACSCentralDBInstancesUtilizationCritical expr: | - expr: acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.9 + acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.9 for: 1h labels: severity: critical @@ -230,7 +230,7 @@ spec: sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" - alert: RHACSCentralDBManualSnapshotsUtilizationCritical expr: | - expr: 
acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.9 + acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.9 for: 5m labels: severity: critical From 059d14e4fef4821d76e6cbff67c6d3a7e8321669 Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Fri, 26 May 2023 13:12:23 +0200 Subject: [PATCH 4/9] Fix whitespace --- resources/prometheus/prometheus-rules.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index ffe95b89..8bade534 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -175,7 +175,7 @@ spec: annotations: summary: "Fleetshard synchronizer manages `{{ $value }}` centrals." description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state." 
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md" - name: rhacs-aws-quota rules: - alert: RHACSCentralDBClustersUtilizationHigh From 55509b6431a15cdb843c9eaddfdd9295d5a8565c Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Fri, 26 May 2023 15:26:22 +0200 Subject: [PATCH 5/9] Add unit tests --- .../prometheus/unit_tests/RHACSAWSQuota.yaml | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 resources/prometheus/unit_tests/RHACSAWSQuota.yaml diff --git a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml new file mode 100644 index 00000000..4dfd9ec2 --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml @@ -0,0 +1,108 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: acs_fleetshard_central_db_clusters_used + values: "81+0x60 91+0x120" + - series: acs_fleetshard_central_db_clusters_max + values: "100+0x130 150+0x50" + alert_rule_test: + - eval_time: 50m + alertname: RHACSCentralDBClustersUtilizationHigh + exp_alerts: [] + - eval_time: 61m + alertname: RHACSCentralDBClustersUtilizationHigh + exp_alerts: + - exp_labels: + alertname: RHACSCentralDBClustersUtilizationHigh + severity: warning + exp_annotations: + summary: "The number of RDS DB clusters is close to its limit." + description: "The number of AWS RDS DB clusters is close to its limit. A quota increase should be requested from AWS." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - eval_time: 121m + alertname: RHACSCentralDBClustersUtilizationCritical + exp_alerts: + - exp_labels: + alertname: RHACSCentralDBClustersUtilizationCritical + severity: critical + exp_annotations: + summary: "The number of RDS DB clusters is very close to its limit." + description: "The number of AWS RDS DB clusters is close to its limit. A quota increase must be requested from AWS." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - eval_time: 131m + alertname: RHACSCentralDBClustersUtilizationCritical + exp_alerts: + [] + - interval: 1m + input_series: + - series: acs_fleetshard_central_db_instances_used + values: "81+0x60 91+0x120" + - series: acs_fleetshard_central_db_instances_max + values: "100+0x130 150+0x50" + alert_rule_test: + - eval_time: 50m + alertname: RHACSCentralDBInstancesUtilizationHigh + exp_alerts: [] + - eval_time: 61m + alertname: RHACSCentralDBInstancesUtilizationHigh + exp_alerts: + - exp_labels: + alertname: RHACSCentralDBInstancesUtilizationHigh + severity: warning + exp_annotations: + summary: "The number of RDS DB instances is close to its limit." + description: "The number of AWS RDS DB instances is close to its limit. A quota increase should be requested from AWS." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - eval_time: 121m + alertname: RHACSCentralDBInstancesUtilizationCritical + exp_alerts: + - exp_labels: + alertname: RHACSCentralDBInstancesUtilizationCritical + severity: critical + exp_annotations: + summary: "The number of RDS DB instances is very close to its limit." + description: "The number of AWS RDS DB instances is close to its limit. A quota increase must be requested from AWS." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - eval_time: 131m + alertname: RHACSCentralDBInstancesUtilizationCritical + exp_alerts: + [] + - interval: 1m + input_series: + - series: acs_fleetshard_central_db_snapshots_used + values: "81+0x60 91+0x120" + - series: acs_fleetshard_central_db_snapshots_max + values: "100+0x130 150+0x50" + alert_rule_test: + - eval_time: 2m + alertname: RHACSCentralDBManualSnapshotsUtilizationHigh + exp_alerts: [] + - eval_time: 7m + alertname: RHACSCentralDBManualSnapshotsUtilizationHigh + exp_alerts: + - exp_labels: + alertname: RHACSCentralDBManualSnapshotsUtilizationHigh + severity: warning + exp_annotations: + summary: "The number of RDS DB snapshots is close to its limit." + description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase should be requested from AWS." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + - eval_time: 131m + alertname: RHACSCentralDBManualSnapshotsUtilizationCritical + exp_alerts: + [] \ No newline at end of file From 6529bc55eda333b1faee17bf24ebb550c0c556d1 Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Fri, 26 May 2023 15:44:45 +0200 Subject: [PATCH 6/9] Newline --- resources/prometheus/unit_tests/RHACSAWSQuota.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml index 4dfd9ec2..2f4f8c3b 100644 --- a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml +++ b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml @@ -105,4 +105,4 @@ tests: - eval_time: 131m alertname: RHACSCentralDBManualSnapshotsUtilizationCritical exp_alerts: - [] \ No newline at end of file + [] From cb812589224c1fe7c12abec5cb9c0717e5472cfe Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Fri, 26 May 2023 18:35:13 +0200 Subject: [PATCH 7/9] Update SOP file name --- resources/prometheus/prometheus-rules.yaml | 12 ++++++------ resources/prometheus/unit_tests/RHACSAWSQuota.yaml | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 8bade534..b7f9fc30 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -187,7 +187,7 @@ spec: annotations: summary: "The number of RDS DB clusters is close to its limit." description: "The number of AWS RDS DB clusters is close to its limit. A quota increase should be requested from AWS." 
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBInstancesUtilizationHigh expr: | acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.8 @@ -197,7 +197,7 @@ spec: annotations: summary: "The number of RDS DB instances is close to its limit." description: "The number of AWS RDS DB instances is close to its limit. A quota increase should be requested from AWS." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBManualSnapshotsUtilizationHigh expr: | acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.8 @@ -207,7 +207,7 @@ spec: annotations: summary: "The number of RDS DB snapshots is close to its limit." description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase should be requested from AWS." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBClustersUtilizationCritical expr: | acs_fleetshard_central_db_clusters_used / acs_fleetshard_central_db_clusters_max >= 0.9 @@ -217,7 +217,7 @@ spec: annotations: summary: "The number of RDS DB clusters is very close to its limit." description: "The number of AWS RDS DB clusters is close to its limit. A quota increase must be requested from AWS." 
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBInstancesUtilizationCritical expr: | acs_fleetshard_central_db_instances_used / acs_fleetshard_central_db_instances_max >= 0.9 @@ -227,7 +227,7 @@ spec: annotations: summary: "The number of RDS DB instances is very close to its limit." description: "The number of AWS RDS DB instances is close to its limit. A quota increase must be requested from AWS." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBManualSnapshotsUtilizationCritical expr: | acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.9 @@ -237,7 +237,7 @@ spec: annotations: summary: "The number of RDS DB snapshots is very close to its limit." description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase must be requested from AWS." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - name: rhacs-probe rules: - alert: RHACSProbeRunFailed diff --git a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml index 2f4f8c3b..b8b2aa05 100644 --- a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml +++ b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml @@ -23,7 +23,7 @@ tests: exp_annotations: summary: "The number of RDS DB clusters is close to its limit." 
description: "The number of AWS RDS DB clusters is close to its limit. A quota increase should be requested from AWS." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - eval_time: 121m alertname: RHACSCentralDBClustersUtilizationCritical exp_alerts: @@ -33,7 +33,7 @@ tests: exp_annotations: summary: "The number of RDS DB clusters is very close to its limit." description: "The number of AWS RDS DB clusters is close to its limit. A quota increase must be requested from AWS." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - eval_time: 131m alertname: RHACSCentralDBClustersUtilizationCritical exp_alerts: @@ -57,7 +57,7 @@ tests: exp_annotations: summary: "The number of RDS DB instances is close to its limit." description: "The number of AWS RDS DB instances is close to its limit. A quota increase should be requested from AWS." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - eval_time: 121m alertname: RHACSCentralDBInstancesUtilizationCritical exp_alerts: @@ -67,7 +67,7 @@ tests: exp_annotations: summary: "The number of RDS DB instances is very close to its limit." description: "The number of AWS RDS DB instances is close to its limit. A quota increase must be requested from AWS." 
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - eval_time: 131m alertname: RHACSCentralDBInstancesUtilizationCritical exp_alerts: @@ -91,7 +91,7 @@ tests: exp_annotations: summary: "The number of RDS DB snapshots is close to its limit." description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase should be requested from AWS." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - eval_time: 66m alertname: RHACSCentralDBManualSnapshotsUtilizationCritical exp_alerts: @@ -101,7 +101,7 @@ tests: exp_annotations: summary: "The number of RDS DB snapshots is very close to its limit." description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase must be requested from AWS." 
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-limits.md" + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - eval_time: 131m alertname: RHACSCentralDBManualSnapshotsUtilizationCritical exp_alerts: From 8871f10aca85f904fa28e94ce4ee754918db0a5a Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Tue, 30 May 2023 14:28:59 +0200 Subject: [PATCH 8/9] Display items left in description --- resources/prometheus/prometheus-rules.yaml | 18 +++++++---- .../prometheus/unit_tests/RHACSAWSQuota.yaml | 32 +++++++++---------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index b7f9fc30..060a171b 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -186,7 +186,8 @@ spec: severity: warning annotations: summary: "The number of RDS DB clusters is close to its limit." - description: "The number of AWS RDS DB clusters is close to its limit. A quota increase should be requested from AWS." + description: | + Remaining DB clusters: {{ with query "acs_fleetshard_central_db_clusters_max - acs_fleetshard_central_db_clusters_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBInstancesUtilizationHigh expr: | @@ -196,7 +197,8 @@ spec: severity: warning annotations: summary: "The number of RDS DB instances is close to its limit." - description: "The number of AWS RDS DB instances is close to its limit. A quota increase should be requested from AWS." 
+ description: | + Remaining DB instances: {{ with query "acs_fleetshard_central_db_instances_max - acs_fleetshard_central_db_instances_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBManualSnapshotsUtilizationHigh expr: | @@ -206,7 +208,8 @@ spec: severity: warning annotations: summary: "The number of RDS DB snapshots is close to its limit." - description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase should be requested from AWS." + description: | + Remaining DB manual snapshots: {{ with query "acs_fleetshard_central_db_snapshots_max - acs_fleetshard_central_db_snapshots_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase should be requested from AWS. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBClustersUtilizationCritical expr: | @@ -216,7 +219,8 @@ spec: severity: critical annotations: summary: "The number of RDS DB clusters is very close to its limit." - description: "The number of AWS RDS DB clusters is close to its limit. A quota increase must be requested from AWS." + description: | + Remaining DB clusters: {{ with query "acs_fleetshard_central_db_clusters_max - acs_fleetshard_central_db_clusters_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBInstancesUtilizationCritical expr: | @@ -226,7 +230,8 @@ spec: severity: critical annotations: summary: "The number of RDS DB instances is very close to its limit." - description: "The number of AWS RDS DB instances is close to its limit. 
A quota increase must be requested from AWS." + description: | + Remaining DB instances: {{ with query "acs_fleetshard_central_db_instances_max - acs_fleetshard_central_db_instances_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - alert: RHACSCentralDBManualSnapshotsUtilizationCritical expr: | @@ -236,7 +241,8 @@ spec: severity: critical annotations: summary: "The number of RDS DB snapshots is very close to its limit." - description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase must be requested from AWS." + description: | + Remaining DB manual snapshots: {{ with query "acs_fleetshard_central_db_snapshots_max - acs_fleetshard_central_db_snapshots_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - name: rhacs-probe rules: diff --git a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml index b8b2aa05..83b55361 100644 --- a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml +++ b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml @@ -7,9 +7,9 @@ tests: - interval: 1m input_series: - series: acs_fleetshard_central_db_clusters_used - values: "81+0x60 91+0x120" + values: "81+0x70 91+0x110" - series: acs_fleetshard_central_db_clusters_max - values: "100+0x130 150+0x50" + values: "100+0x140 150+0x50" alert_rule_test: - eval_time: 50m alertname: RHACSCentralDBClustersUtilizationHigh @@ -22,9 +22,9 @@ tests: severity: warning exp_annotations: summary: "The number of RDS DB clusters is close to its limit." - description: "The number of AWS RDS DB clusters is close to its limit. A quota increase should be requested from AWS." 
+ description: "Remaining DB clusters: 19. A quota increase should be requested from AWS.\n" sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - - eval_time: 121m + - eval_time: 131m alertname: RHACSCentralDBClustersUtilizationCritical exp_alerts: - exp_labels: @@ -32,18 +32,18 @@ tests: severity: critical exp_annotations: summary: "The number of RDS DB clusters is very close to its limit." - description: "The number of AWS RDS DB clusters is close to its limit. A quota increase must be requested from AWS." + description: "Remaining DB clusters: 9. A quota increase must be requested from AWS.\n" sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - - eval_time: 131m + - eval_time: 141m alertname: RHACSCentralDBClustersUtilizationCritical exp_alerts: [] - interval: 1m input_series: - series: acs_fleetshard_central_db_instances_used - values: "81+0x60 91+0x120" + values: "165+0x70 182+0x110" - series: acs_fleetshard_central_db_instances_max - values: "100+0x130 150+0x50" + values: "200+0x140 300+0x50" alert_rule_test: - eval_time: 50m alertname: RHACSCentralDBInstancesUtilizationHigh @@ -56,9 +56,9 @@ tests: severity: warning exp_annotations: summary: "The number of RDS DB instances is close to its limit." - description: "The number of AWS RDS DB instances is close to its limit. A quota increase should be requested from AWS." + description: "Remaining DB instances: 35. A quota increase should be requested from AWS.\n" sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - - eval_time: 121m + - eval_time: 131m alertname: RHACSCentralDBInstancesUtilizationCritical exp_alerts: - exp_labels: @@ -66,18 +66,18 @@ tests: severity: critical exp_annotations: summary: "The number of RDS DB instances is very close to its limit." 
- description: "The number of AWS RDS DB instances is close to its limit. A quota increase must be requested from AWS." + description: "Remaining DB instances: 18. A quota increase must be requested from AWS.\n" sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - - eval_time: 131m + - eval_time: 141m alertname: RHACSCentralDBInstancesUtilizationCritical exp_alerts: [] - interval: 1m input_series: - series: acs_fleetshard_central_db_snapshots_used - values: "81+0x60 91+0x120" + values: "801+0x60 901+0x120" - series: acs_fleetshard_central_db_snapshots_max - values: "100+0x130 150+0x50" + values: "1000+0x130 1500+0x50" alert_rule_test: - eval_time: 2m alertname: RHACSCentralDBManualSnapshotsUtilizationHigh @@ -90,7 +90,7 @@ tests: severity: warning exp_annotations: summary: "The number of RDS DB snapshots is close to its limit." - description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase should be requested from AWS." + description: "Remaining DB manual snapshots: 199. A quota increase should be requested from AWS.\n" sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - eval_time: 66m alertname: RHACSCentralDBManualSnapshotsUtilizationCritical @@ -100,7 +100,7 @@ tests: severity: critical exp_annotations: summary: "The number of RDS DB snapshots is very close to its limit." - description: "The number of AWS RDS DB manual snapshots is close to its limit. A quota increase must be requested from AWS." + description: "Remaining DB manual snapshots: 99. 
A quota increase must be requested from AWS.\n" sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - eval_time: 131m alertname: RHACSCentralDBManualSnapshotsUtilizationCritical From dabbee6032aee80e0cf75c3074737fae7ae78c42 Mon Sep 17 00:00:00 2001 From: Vlad Bologa Date: Tue, 30 May 2023 18:16:31 +0200 Subject: [PATCH 9/9] Remove DB snapshots critical alert --- resources/prometheus/prometheus-rules.yaml | 11 ----------- .../prometheus/unit_tests/RHACSAWSQuota.yaml | 18 ++++-------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 060a171b..5d2fdb23 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -233,17 +233,6 @@ spec: description: | Remaining DB instances: {{ with query "acs_fleetshard_central_db_instances_max - acs_fleetshard_central_db_instances_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - - alert: RHACSCentralDBManualSnapshotsUtilizationCritical - expr: | - acs_fleetshard_central_db_snapshots_used / acs_fleetshard_central_db_snapshots_max >= 0.9 - for: 5m - labels: - severity: critical - annotations: - summary: "The number of RDS DB snapshots is very close to its limit." - description: | - Remaining DB manual snapshots: {{ with query "acs_fleetshard_central_db_snapshots_max - acs_fleetshard_central_db_snapshots_used" }}{{ . | first | value | humanize }}{{ end }}. A quota increase must be requested from AWS. 
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - name: rhacs-probe rules: - alert: RHACSProbeRunFailed diff --git a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml index 83b55361..df62ff9e 100644 --- a/resources/prometheus/unit_tests/RHACSAWSQuota.yaml +++ b/resources/prometheus/unit_tests/RHACSAWSQuota.yaml @@ -75,9 +75,9 @@ tests: - interval: 1m input_series: - series: acs_fleetshard_central_db_snapshots_used - values: "801+0x60 901+0x120" + values: "801+0x60 901+0x60" - series: acs_fleetshard_central_db_snapshots_max - values: "1000+0x130 1500+0x50" + values: "1000+0x70 1500+0x50" alert_rule_test: - eval_time: 2m alertname: RHACSCentralDBManualSnapshotsUtilizationHigh @@ -92,17 +92,7 @@ tests: summary: "The number of RDS DB snapshots is close to its limit." description: "Remaining DB manual snapshots: 199. A quota increase should be requested from AWS.\n" sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - - eval_time: 66m - alertname: RHACSCentralDBManualSnapshotsUtilizationCritical - exp_alerts: - - exp_labels: - alertname: RHACSCentralDBManualSnapshotsUtilizationCritical - severity: critical - exp_annotations: - summary: "The number of RDS DB snapshots is very close to its limit." - description: "Remaining DB manual snapshots: 99. A quota increase must be requested from AWS.\n" - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-029-increase-aws-rds-limits.md" - - eval_time: 131m - alertname: RHACSCentralDBManualSnapshotsUtilizationCritical + - eval_time: 71m + alertname: RHACSCentralDBManualSnapshotsUtilizationHigh exp_alerts: []