Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROX-25003: Add emailsender alerts #272

Merged
merged 3 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,59 @@ spec:
description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"

# Alert rules for the ACS email sender service (pods matching `emailsender-.*`).
- name: rhacs-emailsender
  rules:
    # Fires when fewer than half of the scrapes of a ready emailsender pod
    # succeeded over the last 10m, or when no `up` series for an emailsender
    # pod exists at all; either condition must hold for 20m.
    # NOTE(review): the `absent()` branch yields a sample with value 1 and no
    # `pod`/`namespace` labels, so the annotations render with empty names and
    # "100%" in that case - confirm this is acceptable.
    - alert: RHACSEmailsenderScrapeFailed
      expr: |
        (avg_over_time(up{pod=~"emailsender-.*"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{pod=~"emailsender-.*"} == 1) or absent(up{pod=~"emailsender-.*"})
      for: 20m
      labels:
        severity: warning
      annotations:
        summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`."
        description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful."
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"

    # Fires when the container's readiness averaged below 0.5 over a 10m
    # window, sustained for 20m.
    - alert: RHACSEmailsenderContainerDown
      expr: |
        avg_over_time(kube_pod_container_status_ready{pod=~"emailsender-.*"}[10m]) < 0.5
      for: 20m
      labels:
        severity: warning
      annotations:
        summary: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status."
        description: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes."
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"

    # Fires when the restart counter grew by more than 3 within 30m.
    # There is no `for` clause, so it fires on the first evaluation where the
    # expression is true.
    - alert: RHACSEmailsenderContainerFrequentlyRestarting
      expr: increase(kube_pod_container_status_restarts_total{pod=~"emailsender-.*"}[30m]) > 3
      labels:
        severity: warning
      annotations:
        summary: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times."
        description: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes."
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"

    # Fires when the ratio of the failed-send rate to the send rate over 10m
    # stays above 10% for 5m.
    - alert: RHACSEmailsenderSendErrors
      expr: |
        (rate(acs_emailsender_failed_send_email_total[10m])
        /
        rate(acs_emailsender_send_email_total[10m])) > 0.10
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Email Sender container failing sending emails"
        description: "Email Sender has a send email error rate of {{ $value | humanizePercentage }} over the last 10 minutes."
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md"

    # Fires when the throttled-send counter grew during the last hour.
    # Review feedback on this rule: "Is there a rate missing here? If this is
    # a counter, it will always fire otherwise." The `_total` suffix suggests a
    # counter, so comparing the raw value with 0 would keep the alert firing
    # forever after the first throttled send. Using `increase()` lets the
    # alert resolve once no throttling has happened for the whole window.
    # `$value` is therefore the (extrapolated) number of throttled sends within
    # the last hour rather than the lifetime total.
    - alert: RHACSEmailsenderThrottledSend
      expr: |
        increase(acs_emailsender_throttled_send_email_total[1h]) > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Email Sender throttled sending for `{{ $labels.tenant_id }}` Central instance"
        description: "Email Sender is throttled {{ $value }} times for `{{ $labels.tenant_id }}` Central"
        sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"

- name: tenant-resources
rules:
- expr: |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# promtool unit test for the RHACSEmailsenderContainerDown alert.
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Container reports ready (1) for the samples at 0m..10m, then
      # not ready (0) for the remaining 51 samples.
      - series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123", container="emailsender"}
        values: "1+0x10 0+0x50"
    alert_rule_test:
      # Shortly after readiness drops, no alert is expected to be firing.
      - alertname: RHACSEmailsenderContainerDown
        eval_time: 15m
        exp_alerts: []
      # Well after readiness dropped, the alert is expected to fire with the
      # series labels and fully rendered annotations.
      - alertname: RHACSEmailsenderContainerDown
        eval_time: 40m
        exp_alerts:
          - exp_labels:
              alertname: RHACSEmailsenderContainerDown
              severity: warning
              namespace: rhacs
              pod: emailsender-123
              container: emailsender
            exp_annotations:
              summary: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` is down or in a CrashLoopBackOff status."
              description: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` has been down or in a CrashLoopBackOff status for at least 10 minutes."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# promtool unit test for the RHACSEmailsenderContainerFrequentlyRestarting alert.
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Restart counter: flat at 0 for 0m..30m, then climbing by 1 per
      # minute; the third segment restarts from 4, i.e. below the previous
      # sample.
      - series: kube_pod_container_status_restarts_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
        values: "0+0x30 1+1x10 4+1x20"
    alert_rule_test:
      # The counter has not moved yet, so nothing should fire.
      - alertname: RHACSEmailsenderContainerFrequentlyRestarting
        eval_time: 30m
        exp_alerts: []
      # After a long run of restarts, the alert is expected to fire.
      - alertname: RHACSEmailsenderContainerFrequentlyRestarting
        eval_time: 60m
        exp_alerts:
          - exp_labels:
              alertname: RHACSEmailsenderContainerFrequentlyRestarting
              severity: warning
              namespace: rhacs
              pod: emailsender-123
              container: emailsender
            exp_annotations:
              summary: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` restarted more than 3 times."
              description: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` has restarted more than 3 times during the last 30 minutes."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# promtool unit test for the RHACSEmailsenderScrapeFailed alert.
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Scrapes fail (up == 0) for the samples at 0m..20m, then succeed.
      - series: up{namespace="rhacs", pod="emailsender-123", instance="1.2.3.4:9090"}
        values: "0+0x20 1+0x20"
      # The pod itself reports ready for the whole test.
      - series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123"}
        values: "1+0x40"
    alert_rule_test:
      # Early in the outage, no alert should be firing yet.
      - alertname: RHACSEmailsenderScrapeFailed
        eval_time: 10m
        exp_alerts: []
      # Shortly after scrapes recover, the 10m average is still expected to
      # be below 50% and the alert to be firing.
      - alertname: RHACSEmailsenderScrapeFailed
        eval_time: 25m
        exp_alerts:
          - exp_labels:
              alertname: RHACSEmailsenderScrapeFailed
              severity: warning
              namespace: rhacs
              pod: emailsender-123
              instance: "1.2.3.4:9090"
            exp_annotations:
              summary: "Prometheus unable to scrape metrics from target `emailsender-123` in namespace `rhacs`."
              description: "During the last 10 minutes, only `45.45%` of scrapes of target `emailsender-123` in namespace `rhacs` were successful. This alert is raised when less than 50% of scrapes are successful."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
29 changes: 29 additions & 0 deletions resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# promtool unit test for the RHACSEmailsenderSendErrors alert.
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Failed sends: none for 0m..10m, then one more failure every minute.
      - series: acs_emailsender_failed_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
        values: "0+0x10 1+1x50"
      # Sends: +1 per minute for 0m..10m, then +2 per minute, so the later
      # failed/sent rate ratio is 1:2.
      - series: acs_emailsender_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
        values: "1+1x10 1+2x50"
    alert_rule_test:
      # Shortly after failures begin, no alert should be firing yet.
      - alertname: RHACSEmailsenderSendErrors
        eval_time: 15m
        exp_alerts: []
      # With a steady 50% error rate, the alert is expected to fire.
      - alertname: RHACSEmailsenderSendErrors
        eval_time: 40m
        exp_alerts:
          - exp_labels:
              alertname: RHACSEmailsenderSendErrors
              severity: warning
              namespace: rhacs
              pod: emailsender-123
              container: emailsender
            exp_annotations:
              summary: "Email Sender container failing sending emails"
              description: "Email Sender has a send email error rate of 50% over the last 10 minutes."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md"
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# promtool unit test for the RHACSEmailsenderThrottledSend alert.
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Throttle counter for tenant `centralid`: 0 for the samples at
      # 0m..10m, then constant at 2.
      - series: acs_emailsender_throttled_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender", tenant_id="centralid"}
        values: "0+0x10 2+0x50"
    alert_rule_test:
      # Shortly after the counter first becomes non-zero, no alert should be
      # firing yet.
      - alertname: RHACSEmailsenderThrottledSend
        eval_time: 15m
        exp_alerts: []
      # Later, the alert is expected to fire and report a value of 2.
      - alertname: RHACSEmailsenderThrottledSend
        eval_time: 40m
        exp_alerts:
          - exp_labels:
              alertname: RHACSEmailsenderThrottledSend
              severity: warning
              namespace: rhacs
              pod: emailsender-123
              container: emailsender
              tenant_id: centralid
            exp_annotations:
              summary: "Email Sender throttled sending for `centralid` Central instance"
              description: "Email Sender is throttled 2 times for `centralid` Central"
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"
Loading