From 2d8544c08fe2f5665bc6bd109715108b361f00f0 Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Thu, 8 Aug 2024 13:58:44 +0200 Subject: [PATCH 01/16] Certificate Expiring Critical + Warning --- resources/prometheus/prometheus-rules.yaml | 16 +++++++++++ ...leetschardCertificateExpiringCritical.yaml | 28 +++++++++++++++++++ ...ACSFleetschardCertificateExpiringSoon.yaml | 28 +++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringCritical.yaml create mode 100644 resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringSoon.yaml diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index a9f9f332..58b5530f 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -128,6 +128,22 @@ spec: summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times." description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-005-fleetshard-sync-unavailable.md" + - alert: CertificateExpiringCritical + expr: | + ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<1 + labels: + severity: critical + annotations: + summary: "Certificate Expiring very soon in namespace `{{ $labels.namespace }}` for secret '{{ $labels.secret}}'." + description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than a day." + - alert: CertificateExpiringSoon + expr: | + ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<7 + labels: + severity: warning + annotations: + summary: "Certificate Expiring soon in namespace `{{ $labels.namespace }}` for secret '{{ $labels.secret}}'." + description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than 7 days." - alert: RHACSFleetshardSyncReconciliationErrors expr: | acs_fleetshard_central_errors_per_reconciliations:ratio_rate10m > 0.10 diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringCritical.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringCritical.yaml new file mode 100644 index 00000000..2f147738 --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringCritical.yaml @@ -0,0 +1,28 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret-1", data_key="key-1"} + values: "700000" #future example + - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret-critical", data_key="key-critical"} + values: "80000" #less than 1 day + alert_rule_test: + - eval_time: 15m + alertname: RHACSFleetshardCertificateExpiringCritical + exp_alerts: [] + - eval_time: 15m + alertname: RHACSFleetshardCertificateExpiringCritical + exp_alerts: + - exp_labels: + alertname: RHACSFleetshardCertificateExpiringCritical + namespace: rhacs + secret: secret-critical + data_key: key-critical + severity: critical + exp_annotations: + summary: "Certificate Expiring very soon in namespace `rhacs` for secret 'secret-critical'." + description: "Certificate `key-critical` in namespace `rhacs` is expiring in less than a day." \ No newline at end of file diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringSoon.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringSoon.yaml new file mode 100644 index 00000000..7400037a --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringSoon.yaml @@ -0,0 +1,28 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret-1", data_key="key-1"} + values: "700000" #future + - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret-warning", data_key="key-warning"} + values: '400000' #less than 7 days + alert_rule_test: + - eval_time: 5m + alertname: RHACSFleetshardCertificateExpiringSoon + exp_alerts: [] + - eval_time: 15m + alertname: RHACSFleetshardCertificateExpiringSoon + exp_alerts: + - exp_labels: + alertname: RHACSFleetshardCertificateExpiringSoon + namespace: rhacs + secret: secret-warning + data_key: key-warning + severity: warning + exp_annotations: + summary: "Certificate Expiring soon in namespace 'rhacs' for secret 'secret-warning'." + description: "Certificate `key-warning` in namespace `rhacs` is expiring in less than 7 days." \ No newline at end of file From 4d4bad87a1586ad58a861bc87b78b4c045175b62 Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Fri, 9 Aug 2024 11:08:44 +0200 Subject: [PATCH 02/16] Certificate Expiring Critical + Warning-updated --- resources/prometheus/prometheus-rules.yaml | 4 +- .../RHACSFleetschardCertificateExpiring.yaml | 42 +++++++++++++++++++ ...leetschardCertificateExpiringCritical.yaml | 28 ------------- ...ACSFleetschardCertificateExpiringSoon.yaml | 28 ------------- 4 files changed, 44 insertions(+), 58 deletions(-) create mode 100644 resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringCritical.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringSoon.yaml diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 58b5530f..671cc8c5 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -128,7 +128,7 @@ spec: summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times." description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-005-fleetshard-sync-unavailable.md" - - alert: CertificateExpiringCritical + - alert: RHACSFleetshardCertificateExpiringCritical expr: | ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<1 labels: @@ -136,7 +136,7 @@ spec: annotations: summary: "Certificate Expiring very soon in namespace `{{ $labels.namespace }}` for secret '{{ $labels.secret}}'." description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than a day." - - alert: CertificateExpiringSoon + - alert: RHACSFleetshardCertificateExpiringSoon expr: | ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<7 labels: diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml new file mode 100644 index 00000000..8e0de04b --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -0,0 +1,42 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1d + input_series: + - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret", data_key="key"} + values: "604800+15x0" # equals to 7 days + + alert_rule_test: + - eval_time: 0 + alertname: RHACSFleetshardCertificateExpiringCritical + exp_alerts: [ ] + - eval_time: 3d + alertname: RHACSFleetshardCertificateExpiringSoon + exp_alerts: + - exp_labels: + alertname: RHACSFleetshardCertificateExpiringSoon + namespace: rhacs + secret: secret + data_key: key + severity: warning + exp_annotations: + summary: "Certificate Expiring soon in namespace `rhacs` for secret 'secret'." + description: "Certificate `key` in namespace `rhacs` is expiring in less than 7 days." + - eval_time: 6d + alertname: RHACSFleetshardCertificateExpiringCritical + exp_alerts: + - exp_labels: + alertname: RHACSFleetshardCertificateExpiringCritical + namespace: rhacs + secret: secret + data_key: key + severity: critical + exp_annotations: + summary: "Certificate Expiring very soon in namespace `rhacs` for secret 'secret'." + description: "Certificate `key` in namespace `rhacs` is expiring in less than a day." + - eval_time: 10d + alertname: RHACSFleetshardCertificateExpiringSoon + exp_alerts: [ ] diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringCritical.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringCritical.yaml deleted file mode 100644 index 2f147738..00000000 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringCritical.yaml +++ /dev/null @@ -1,28 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret-1", data_key="key-1"} - values: "700000" #future example - - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret-critical", data_key="key-critical"} - values: "80000" #less than 1 day - alert_rule_test: - - eval_time: 15m - alertname: RHACSFleetshardCertificateExpiringCritical - exp_alerts: [] - - eval_time: 15m - alertname: RHACSFleetshardCertificateExpiringCritical - exp_alerts: - - exp_labels: - alertname: RHACSFleetshardCertificateExpiringCritical - namespace: rhacs - secret: secret-critical - data_key: key-critical - severity: critical - exp_annotations: - summary: "Certificate Expiring very soon in namespace `rhacs` for secret 'secret-critical'." - description: "Certificate `key-critical` in namespace `rhacs` is expiring in less than a day." \ No newline at end of file diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringSoon.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringSoon.yaml deleted file mode 100644 index 7400037a..00000000 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiringSoon.yaml +++ /dev/null @@ -1,28 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret-1", data_key="key-1"} - values: "700000" #future - - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret-warning", data_key="key-warning"} - values: '400000' #less than 7 days - alert_rule_test: - - eval_time: 5m - alertname: RHACSFleetshardCertificateExpiringSoon - exp_alerts: [] - - eval_time: 15m - alertname: RHACSFleetshardCertificateExpiringSoon - exp_alerts: - - exp_labels: - alertname: RHACSFleetshardCertificateExpiringSoon - namespace: rhacs - secret: secret-warning - data_key: key-warning - severity: warning - exp_annotations: - summary: "Certificate Expiring soon in namespace 'rhacs' for secret 'secret-warning'." - description: "Certificate `key-warning` in namespace `rhacs` is expiring in less than 7 days." \ No newline at end of file From c8e0a9d36e1e9afb244a6b9e16bd0f1a40389547 Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Fri, 9 Aug 2024 13:19:34 +0200 Subject: [PATCH 03/16] Certificate Expiring Critical + Warning-updated --- resources/prometheus/prometheus-rules.yaml | 4 ++-- .../RHACSFleetschardCertificateExpiring.yaml | 20 ++++++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 671cc8c5..7a4e1994 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -130,12 +130,12 @@ spec: sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-005-fleetshard-sync-unavailable.md" - alert: RHACSFleetshardCertificateExpiringCritical expr: | - ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<1 + ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<=1 labels: severity: critical annotations: summary: "Certificate Expiring very soon in namespace `{{ $labels.namespace }}` for secret '{{ $labels.secret}}'." - description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than a day." + description: "Certificate '{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` expires in '{{ $value | time.Unix | time.Since | humanizeDuration }}'." - alert: RHACSFleetshardCertificateExpiringSoon expr: | ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<7 diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index 8e0de04b..e7bb1b58 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -7,11 +7,11 @@ tests: - interval: 1d input_series: - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret", data_key="key"} - values: "604800+15x0" # equals to 7 days + values: "604800+0x15" # equals to 7 days alert_rule_test: - eval_time: 0 - alertname: RHACSFleetshardCertificateExpiringCritical + alertname: RHACSFleetshardCertificateExpiringSoon exp_alerts: [ ] - eval_time: 3d alertname: RHACSFleetshardCertificateExpiringSoon @@ -36,7 +36,17 @@ tests: severity: critical exp_annotations: summary: "Certificate Expiring very soon in namespace `rhacs` for secret 'secret'." - description: "Certificate `key` in namespace `rhacs` is expiring in less than a day." + description: "Certificate 'key` in namespace `rhacs` expires in '{{ $value | time.Unix | time.Since | humanizeDuration }}'." - eval_time: 10d - alertname: RHACSFleetshardCertificateExpiringSoon - exp_alerts: [ ] + alertname: RHACSFleetshardCertificateExpiringCritical + exp_alerts: + - exp_labels: + alertname: RHACSFleetshardCertificateExpiringCritical + namespace: rhacs + secret: secret + data_key: key + severity: critical + exp_annotations: + summary: "Certificate Expiring very soon in namespace `rhacs` for secret 'secret'." + description: "Certificate 'key` in namespace `rhacs` expires in '{{ $value | time.Unix | time.Since | humanizeDuration }}'." + From dd9679dce71d1ae4daecb8c086e76c9802d97369 Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Fri, 9 Aug 2024 13:57:00 +0200 Subject: [PATCH 04/16] Certificate Expiring Critical + Warning-updated --- resources/prometheus/prometheus-rules.yaml | 4 ++-- .../unit_tests/RHACSFleetschardCertificateExpiring.yaml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 7a4e1994..38c380ae 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -134,8 +134,8 @@ spec: labels: severity: critical annotations: - summary: "Certificate Expiring very soon in namespace `{{ $labels.namespace }}` for secret '{{ $labels.secret}}'." - description: "Certificate '{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` expires in '{{ $value | time.Unix | time.Since | humanizeDuration }}'." + summary: "Certificate expiring very soon: '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." + description: "Certificate '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on '{{ ($value | toTime).UTC().Format \"2006-02-01\" }}'." - alert: RHACSFleetshardCertificateExpiringSoon expr: | ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<7 diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index e7bb1b58..761a8221 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -35,8 +35,8 @@ tests: data_key: key severity: critical exp_annotations: - summary: "Certificate Expiring very soon in namespace `rhacs` for secret 'secret'." - description: "Certificate 'key` in namespace `rhacs` expires in '{{ $value | time.Unix | time.Since | humanizeDuration }}'." + summary: "Certificate expiring very soon: 'rhacs/secret/key`." + description: "Certificate 'rhacs/secret/key` expires on '1970-01-07'." - eval_time: 10d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: @@ -47,6 +47,6 @@ tests: data_key: key severity: critical exp_annotations: - summary: "Certificate Expiring very soon in namespace `rhacs` for secret 'secret'." - description: "Certificate 'key` in namespace `rhacs` expires in '{{ $value | time.Unix | time.Since | humanizeDuration }}'." + summary: "Certificate expiring very soon: 'rhacs/secret/key`." + description: "Certificate 'rhacs/secret/key` expires on '1970-01-11'." From bb1dc01f78ddea317b68cd90ee99a81a24f631cf Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Mon, 12 Aug 2024 12:26:09 +0200 Subject: [PATCH 05/16] updated time check --- resources/prometheus/prometheus-rules.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 38c380ae..ecbde006 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -135,7 +135,7 @@ spec: severity: critical annotations: summary: "Certificate expiring very soon: '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." - description: "Certificate '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on '{{ ($value | toTime).UTC().Format \"2006-02-01\" }}'." + description: "Certificate '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on '{{ ($value | toTime).UTC.Format \"2006-01-02T15:04:05Z07:00\" }}'." - alert: RHACSFleetshardCertificateExpiringSoon expr: | ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<7 From a2221dd8c353eb04c6f642e0e8892d8f70b31b15 Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Mon, 12 Aug 2024 13:43:32 +0200 Subject: [PATCH 06/16] updated time check --- resources/prometheus/prometheus-rules.yaml | 6 +++--- .../RHACSFleetschardCertificateExpiring.yaml | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index ecbde006..c015ced2 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -130,7 +130,7 @@ spec: sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-005-fleetshard-sync-unavailable.md" - alert: RHACSFleetshardCertificateExpiringCritical expr: | - ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<=1 + acs_fleetshard_certificate_expiration_timestamp <=1 * 24 * 60 * 60 + time() labels: severity: critical annotations: @@ -138,12 +138,12 @@ spec: description: "Certificate '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on '{{ ($value | toTime).UTC.Format \"2006-01-02T15:04:05Z07:00\" }}'." - alert: RHACSFleetshardCertificateExpiringSoon expr: | - ((acs_fleetshard_certificate_expiration_timestamp-time())/60/60/24)<7 + acs_fleetshard_certificate_expiration_timestamp <=7* 24 * 60 * 60 + time() labels: severity: warning annotations: summary: "Certificate Expiring soon in namespace `{{ $labels.namespace }}` for secret '{{ $labels.secret}}'." - description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than 7 days." + description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than 7 days, on '{{ ($value | toTime).UTC.Format \"2006-01-02T15:04:05Z07:00\" }}'." - alert: RHACSFleetshardSyncReconciliationErrors expr: | acs_fleetshard_central_errors_per_reconciliations:ratio_rate10m > 0.10 diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index 761a8221..c03a5876 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -7,7 +7,7 @@ tests: - interval: 1d input_series: - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret", data_key="key"} - values: "604800+0x15" # equals to 7 days + values: "691200+0x15" # equals to 8 days alert_rule_test: - eval_time: 0 @@ -24,8 +24,8 @@ tests: severity: warning exp_annotations: summary: "Certificate Expiring soon in namespace `rhacs` for secret 'secret'." - description: "Certificate `key` in namespace `rhacs` is expiring in less than 7 days." - - eval_time: 6d + description: "Certificate `key` in namespace `rhacs` is expiring in less than 7 days, on '1970-01-09T00:00:00Z'." + - eval_time: 7d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: - exp_labels: @@ -36,7 +36,7 @@ tests: severity: critical exp_annotations: summary: "Certificate expiring very soon: 'rhacs/secret/key`." - description: "Certificate 'rhacs/secret/key` expires on '1970-01-07'." + description: "Certificate 'rhacs/secret/key` expires on '1970-01-09T00:00:00Z'." - eval_time: 10d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: @@ -48,5 +48,5 @@ tests: severity: critical exp_annotations: summary: "Certificate expiring very soon: 'rhacs/secret/key`." - description: "Certificate 'rhacs/secret/key` expires on '1970-01-11'." + description: "Certificate 'rhacs/secret/key` expires on '1970-01-09T00:00:00Z'." From 9e6d0e7b1f21fdecdf1d89534a985e61f4eee40f Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Mon, 12 Aug 2024 13:51:04 +0200 Subject: [PATCH 07/16] updated time check --- .../unit_tests/RHACSFleetschardCertificateExpiring.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index c03a5876..cf16ae2c 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -49,4 +49,3 @@ tests: exp_annotations: summary: "Certificate expiring very soon: 'rhacs/secret/key`." description: "Certificate 'rhacs/secret/key` expires on '1970-01-09T00:00:00Z'." - From fc4284ce5ae6dc530cfc2ca4ff8ff1a1407d1965 Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Mon, 12 Aug 2024 16:11:16 +0200 Subject: [PATCH 08/16] updating timestamp rule --- resources/prometheus/prometheus-rules.yaml | 4 ++-- .../unit_tests/RHACSFleetschardCertificateExpiring.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index fd91b6ce..8219f7a5 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -135,7 +135,7 @@ spec: severity: critical annotations: summary: "Certificate expiring very soon: '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." - description: "Certificate '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on '{{ ($value | toTime).UTC.Format \"2006-01-02T15:04:05Z07:00\" }}'." + description: "Certificate '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on '{{ humanizeTimestamp $value}}'." - alert: RHACSFleetshardCertificateExpiringSoon expr: | acs_fleetshard_certificate_expiration_timestamp <=7* 24 * 60 * 60 + time() @@ -143,7 +143,7 @@ spec: severity: warning annotations: summary: "Certificate Expiring soon in namespace `{{ $labels.namespace }}` for secret '{{ $labels.secret}}'." - description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than 7 days, on '{{ ($value | toTime).UTC.Format \"2006-01-02T15:04:05Z07:00\" }}'." + description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than 7 days, on '{{ humanizeTimestamp $value}}'." - alert: RHACSFleetshardSyncReconciliationErrors expr: | acs_fleetshard_central_errors_per_reconciliations:ratio_rate10m > 0.10 diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index cf16ae2c..36ea1a4f 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -24,7 +24,7 @@ tests: severity: warning exp_annotations: summary: "Certificate Expiring soon in namespace `rhacs` for secret 'secret'." - description: "Certificate `key` in namespace `rhacs` is expiring in less than 7 days, on '1970-01-09T00:00:00Z'." + description: "Certificate `key` in namespace `rhacs` is expiring in less than 7 days, on '1970-01-09 00:00:00 +0000 UTC'." - eval_time: 7d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: @@ -36,7 +36,7 @@ tests: severity: critical exp_annotations: summary: "Certificate expiring very soon: 'rhacs/secret/key`." - description: "Certificate 'rhacs/secret/key` expires on '1970-01-09T00:00:00Z'." + description: "Certificate 'rhacs/secret/key` expires on '1970-01-09 00:00:00 +0000 UTC'." - eval_time: 10d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: @@ -48,4 +48,4 @@ tests: severity: critical exp_annotations: summary: "Certificate expiring very soon: 'rhacs/secret/key`." - description: "Certificate 'rhacs/secret/key` expires on '1970-01-09T00:00:00Z'." + description: "Certificate 'rhacs/secret/key` expires on '1970-01-09 00:00:00 +0000 UTC'." From 6a55e903a197424fc135b809d62ddc41abb313d9 Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Tue, 13 Aug 2024 11:28:35 +0200 Subject: [PATCH 09/16] minor changes for consistency --- resources/prometheus/prometheus-rules.yaml | 8 ++++---- .../RHACSFleetschardCertificateExpiring.yaml | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 8219f7a5..65c46ccd 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -134,16 +134,16 @@ spec: labels: severity: critical annotations: - summary: "Certificate expiring very soon: '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." - description: "Certificate '{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on '{{ humanizeTimestamp $value}}'." + summary: "Certificate expiring very soon: `{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." + description: "Certificate `{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}." - alert: RHACSFleetshardCertificateExpiringSoon expr: | acs_fleetshard_certificate_expiration_timestamp <=7* 24 * 60 * 60 + time() labels: severity: warning annotations: - summary: "Certificate Expiring soon in namespace `{{ $labels.namespace }}` for secret '{{ $labels.secret}}'." - description: "Certificate `{{ $labels.data_key }}` in namespace `{{ $labels.namespace }}` is expiring in less than 7 days, on '{{ humanizeTimestamp $value}}'." + summary: "Certificate expiring soon: `{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." + description: "Certificate `{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}." - alert: RHACSFleetshardSyncReconciliationErrors expr: | acs_fleetshard_central_errors_per_reconciliations:ratio_rate10m > 0.10 diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index 36ea1a4f..060284d4 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -23,8 +23,8 @@ tests: data_key: key severity: warning exp_annotations: - summary: "Certificate Expiring soon in namespace `rhacs` for secret 'secret'." - description: "Certificate `key` in namespace `rhacs` is expiring in less than 7 days, on '1970-01-09 00:00:00 +0000 UTC'." + summary: "Certificate expiring soon: `rhacs/secret/key`." + description: "Certificate `rhacs/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." - eval_time: 7d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: @@ -35,8 +35,8 @@ tests: data_key: key severity: critical exp_annotations: - summary: "Certificate expiring very soon: 'rhacs/secret/key`." - description: "Certificate 'rhacs/secret/key` expires on '1970-01-09 00:00:00 +0000 UTC'." + summary: "Certificate expiring very soon: `rhacs/secret/key`." + description: "Certificate `rhacs/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." - eval_time: 10d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: @@ -47,5 +47,5 @@ tests: data_key: key severity: critical exp_annotations: - summary: "Certificate expiring very soon: 'rhacs/secret/key`." - description: "Certificate 'rhacs/secret/key` expires on '1970-01-09 00:00:00 +0000 UTC'." + summary: "Certificate expiring very soon: `rhacs/secret/key`." + description: "Certificate `rhacs/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." From 9c8990d6ffa484dab282f47ecbe39d8c148fb517 Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Thu, 22 Aug 2024 09:59:17 +0200 Subject: [PATCH 10/16] minor changes for consistency --- resources/prometheus/prometheus-rules.yaml | 8 ++++---- .../unit_tests/RHACSFleetschardCertificateExpiring.yaml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 65c46ccd..5bd27830 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -134,16 +134,16 @@ spec: labels: severity: critical annotations: - summary: "Certificate expiring very soon: `{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." - description: "Certificate `{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}." + summary: "Certificate expiring very soon: `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." + description: "Certificate `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}." - alert: RHACSFleetshardCertificateExpiringSoon expr: | acs_fleetshard_certificate_expiration_timestamp <=7* 24 * 60 * 60 + time() labels: severity: warning annotations: - summary: "Certificate expiring soon: `{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." - description: "Certificate `{{ $labels.namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}." + summary: "Certificate expiring soon: `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." + description: "Certificate `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}." - alert: RHACSFleetshardSyncReconciliationErrors expr: | acs_fleetshard_central_errors_per_reconciliations:ratio_rate10m > 0.10 diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index 060284d4..20b1aaaa 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -6,7 +6,7 @@ evaluation_interval: 1m tests: - interval: 1d input_series: - - series: acs_fleetshard_certificate_expiration_timestamp{namespace="rhacs", secret="secret", data_key="key"} + - series: acs_fleetshard_certificate_expiration_timestamp{exported_namespace="rhacs", secret="secret", data_key="key"} values: "691200+0x15" # equals to 8 days alert_rule_test: @@ -18,7 +18,7 @@ tests: exp_alerts: - exp_labels: alertname: RHACSFleetshardCertificateExpiringSoon - namespace: rhacs + exported_namespace: rhacs secret: secret data_key: key severity: warning @@ -30,7 +30,7 @@ tests: exp_alerts: - exp_labels: alertname: RHACSFleetshardCertificateExpiringCritical - namespace: rhacs + exported_namespace: rhacs secret: secret data_key: key severity: critical @@ -42,7 +42,7 @@ tests: exp_alerts: - exp_labels: alertname: RHACSFleetshardCertificateExpiringCritical - namespace: rhacs + exported_namespace: rhacs secret: secret data_key: key severity: critical From 3d20adcc85e1267f6a7ce2700d89cd34886572da Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Thu, 22 Aug 2024 10:18:11 +0200 Subject: [PATCH 11/16] minor changes for consistency --- .../RHACSFleetschardCertificateExpiring.yaml | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index 20b1aaaa..9a88c77f 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -6,7 +6,7 @@ evaluation_interval: 1m tests: - interval: 1d input_series: - - series: acs_fleetshard_certificate_expiration_timestamp{exported_namespace="rhacs", secret="secret", data_key="key"} + - series: acs_fleetshard_certificate_expiration_timestamp{exported_namespace="rhacs-00000000000000000000", secret="secret", data_key="key"} values: "691200+0x15" # equals to 8 days alert_rule_test: @@ -18,34 +18,34 @@ tests: exp_alerts: - exp_labels: alertname: RHACSFleetshardCertificateExpiringSoon - exported_namespace: rhacs + exported_namespace: rhacs-00000000000000000000 secret: secret data_key: key severity: warning exp_annotations: - summary: "Certificate expiring soon: `rhacs/secret/key`." - description: "Certificate `rhacs/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." + summary: "Certificate expiring soon: `rhacs-00000000000000000000/secret/key`." + description: "Certificate `rhacs-00000000000000000000/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." - eval_time: 7d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: - exp_labels: alertname: RHACSFleetshardCertificateExpiringCritical - exported_namespace: rhacs + exported_namespace: rhacs-00000000000000000000 secret: secret data_key: key severity: critical exp_annotations: - summary: "Certificate expiring very soon: `rhacs/secret/key`." - description: "Certificate `rhacs/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." + summary: "Certificate expiring very soon: `rhacs-00000000000000000000/secret/key`." + description: "Certificate `rhacs-00000000000000000000/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." - eval_time: 10d alertname: RHACSFleetshardCertificateExpiringCritical exp_alerts: - exp_labels: alertname: RHACSFleetshardCertificateExpiringCritical - exported_namespace: rhacs + exported_namespace: rhacs-00000000000000000000 secret: secret data_key: key severity: critical exp_annotations: - summary: "Certificate expiring very soon: `rhacs/secret/key`." - description: "Certificate `rhacs/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." + summary: "Certificate expiring very soon: `rhacs-00000000000000000000/secret/key`." + description: "Certificate `rhacs-00000000000000000000/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." From a97705fcafcf79dd432ef91fc29b3e06efb5e13e Mon Sep 17 00:00:00 2001 From: aaa5kameric Date: Mon, 26 Aug 2024 12:08:41 +0200 Subject: [PATCH 12/16] Update from suggestions --- resources/prometheus/prometheus-rules.yaml | 8 ++++---- .../RHACSFleetschardCertificateExpiring.yaml | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 5bd27830..65212316 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -128,17 +128,17 @@ spec: summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times." description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-005-fleetshard-sync-unavailable.md" - - alert: RHACSFleetshardCertificateExpiringCritical + - alert: RHACSFleetshardCertificateExpiryCritical expr: | - acs_fleetshard_certificate_expiration_timestamp <=1 * 24 * 60 * 60 + time() + acs_fleetshard_certificate_expiration_timestamp <= 1 * 24 * 60 * 60 + time() labels: severity: critical annotations: summary: "Certificate expiring very soon: `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`." description: "Certificate `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}." - - alert: RHACSFleetshardCertificateExpiringSoon + - alert: RHACSFleetshardCertificateExpiryWarning expr: | - acs_fleetshard_certificate_expiration_timestamp <=7* 24 * 60 * 60 + time() + acs_fleetshard_certificate_expiration_timestamp <= 7* 24 * 60 * 60 + time() labels: severity: warning annotations: diff --git a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml index 9a88c77f..57516c50 100644 --- a/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetschardCertificateExpiring.yaml @@ -11,13 +11,13 @@ tests: alert_rule_test: - eval_time: 0 - alertname: RHACSFleetshardCertificateExpiringSoon + alertname: RHACSFleetshardCertificateExpiryWarning exp_alerts: [ ] - eval_time: 3d - alertname: RHACSFleetshardCertificateExpiringSoon + alertname: RHACSFleetshardCertificateExpiryWarning exp_alerts: - exp_labels: - alertname: RHACSFleetshardCertificateExpiringSoon + alertname: RHACSFleetshardCertificateExpiryWarning exported_namespace: rhacs-00000000000000000000 secret: secret data_key: key @@ -26,10 +26,10 @@ tests: summary: "Certificate expiring soon: `rhacs-00000000000000000000/secret/key`." description: "Certificate `rhacs-00000000000000000000/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." - eval_time: 7d - alertname: RHACSFleetshardCertificateExpiringCritical + alertname: RHACSFleetshardCertificateExpiryCritical exp_alerts: - exp_labels: - alertname: RHACSFleetshardCertificateExpiringCritical + alertname: RHACSFleetshardCertificateExpiryCritical exported_namespace: rhacs-00000000000000000000 secret: secret data_key: key @@ -38,10 +38,10 @@ tests: summary: "Certificate expiring very soon: `rhacs-00000000000000000000/secret/key`." description: "Certificate `rhacs-00000000000000000000/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC." - eval_time: 10d - alertname: RHACSFleetshardCertificateExpiringCritical + alertname: RHACSFleetshardCertificateExpiryCritical exp_alerts: - exp_labels: - alertname: RHACSFleetshardCertificateExpiringCritical + alertname: RHACSFleetshardCertificateExpiryCritical exported_namespace: rhacs-00000000000000000000 secret: secret data_key: key From f1e34afb06608d465430009998923f5c4506ab37 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Sep 2024 02:28:03 +0200 Subject: [PATCH 13/16] chore(deps): bump pascalgn/automerge-action from 0.16.3 to 0.16.4 (#280) Bumps [pascalgn/automerge-action](https://github.com/pascalgn/automerge-action) from 0.16.3 to 0.16.4. - [Release notes](https://github.com/pascalgn/automerge-action/releases) - [Commits](https://github.com/pascalgn/automerge-action/compare/v0.16.3...v0.16.4) --- updated-dependencies: - dependency-name: pascalgn/automerge-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/automerge.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/automerge.yaml b/.github/workflows/automerge.yaml index 9d79c96d..fcc9e7a6 100644 --- a/.github/workflows/automerge.yaml +++ b/.github/workflows/automerge.yaml @@ -26,7 +26,7 @@ jobs: steps: - id: automerge name: automerge - uses: "pascalgn/automerge-action@v0.16.3" + uses: "pascalgn/automerge-action@v0.16.4" env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" MERGE_METHOD: merge From eea36832e85f70f4a2f44cf6cd807b0ca0d62e34 Mon Sep 17 00:00:00 2001 From: Aleksandr Kurlov Date: Fri, 11 Oct 2024 17:05:05 +0200 Subject: [PATCH 14/16] setup python for pre commit (#282) --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3bde3631..fc339fba 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -10,6 +10,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 + with: + python-version: '3.12' - uses: actions/setup-go@v5 with: go-version: ">=1.18.0" From e0167fece4c53ba09c5cf2ba5259defab137d232 Mon Sep 17 00:00:00 2001 From: Aleksandr Kurlov Date: Fri, 11 Oct 2024 17:19:08 +0200 Subject: [PATCH 15/16] Increase RHACSTenantWorkloadMemoryUtilizationHigh eval time to 30 minutes (#281) --- resources/prometheus/prometheus-rules.yaml | 2 +- .../RHACSTenantWorkloadMemoryUtilizationHigh.yaml | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 65212316..22bed52e 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -242,7 +242,7 @@ spec: - alert: RHACSTenantWorkloadMemoryUtilizationHigh expr: | rhacs_tenants:namespace:pod:container:max_memory_usage_ratio{container="central"} >= 0.85 - for: 10m + for: 30m labels: severity: warning annotations: diff --git a/resources/prometheus/unit_tests/RHACSTenantWorkloadMemoryUtilizationHigh.yaml b/resources/prometheus/unit_tests/RHACSTenantWorkloadMemoryUtilizationHigh.yaml index 7a30b181..58c64897 100644 --- a/resources/prometheus/unit_tests/RHACSTenantWorkloadMemoryUtilizationHigh.yaml +++ b/resources/prometheus/unit_tests/RHACSTenantWorkloadMemoryUtilizationHigh.yaml @@ -7,14 +7,15 @@ tests: - interval: 1m input_series: - series: container_memory_working_set_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa", pod="mypod", container="central"} - values: "50+0x10 85+0x10" + # first 10 minutes no alert and then 85% CPU usage for 40 minutes + values: "50+0x10 85+0x40" - series: container_spec_memory_limit_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa",pod="mypod", container="central"} - values: "100+0x20" + values: "100+0x40" alert_rule_test: - eval_time: 1m alertname: RHACSTenantWorkloadMemoryUtilizationHigh exp_alerts: [] - - eval_time: 21m + - eval_time: 41m alertname: RHACSTenantWorkloadMemoryUtilizationHigh exp_alerts: - exp_labels: From afe0e28b4fb5dd3c62fd6c24d2aa9bc55d4e1aa8 Mon Sep 17 00:00:00 2001 From: Ludovic Cleroux Date: Tue, 29 Oct 2024 12:38:45 +0100 Subject: [PATCH 16/16] Increase tenant workload memory alert threshold to 90% --- resources/prometheus/prometheus-rules.yaml | 2 +- .../RHACSTenantWorkloadMemoryUtilizationHigh.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 22bed52e..98eef14c 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -241,7 +241,7 @@ spec: record: rhacs_tenants:namespace:pod:container:max_memory_usage_ratio - alert: RHACSTenantWorkloadMemoryUtilizationHigh expr: | - rhacs_tenants:namespace:pod:container:max_memory_usage_ratio{container="central"} >= 0.85 + rhacs_tenants:namespace:pod:container:max_memory_usage_ratio{container="central"} >= 0.9 for: 30m labels: severity: warning diff --git a/resources/prometheus/unit_tests/RHACSTenantWorkloadMemoryUtilizationHigh.yaml b/resources/prometheus/unit_tests/RHACSTenantWorkloadMemoryUtilizationHigh.yaml index 58c64897..7ccf1f82 100644 --- a/resources/prometheus/unit_tests/RHACSTenantWorkloadMemoryUtilizationHigh.yaml +++ b/resources/prometheus/unit_tests/RHACSTenantWorkloadMemoryUtilizationHigh.yaml @@ -7,8 +7,8 @@ tests: - interval: 1m input_series: - series: container_memory_working_set_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa", pod="mypod", container="central"} - # first 10 minutes no alert and then 85% CPU usage for 40 minutes - values: "50+0x10 85+0x40" + # first 10 minutes no alert and then 90% CPU usage for 40 minutes + values: "50+0x10 90+0x40" - series: container_spec_memory_limit_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa",pod="mypod", container="central"} values: "100+0x40" alert_rule_test: @@ -26,7 +26,7 @@ tests: container: central exp_annotations: summary: tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'central' in pod 'mypod' is reaching its memory limit. - description: tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'central' in pod 'mypod' reached 85% of its memory limit and is at risk of being OOM killed. + description: tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'central' in pod 'mypod' reached 90% of its memory limit and is at risk of being OOM killed. sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-039-tenant-workload-memory-utilization-high.md" - interval: 1m input_series: