From 615be42850e373edf394ff9b3282bb3fd935a42b Mon Sep 17 00:00:00 2001
From: Kyle Lape
Date: Fri, 16 Jun 2023 12:31:19 -0500
Subject: [PATCH] ROX-17906: Create alerts for egress proxy availability

Two alerts:

* RHACSEgressProxyReplicaCount: *Critical* alert if replica count is
  < 3 for >20m
* RHACSEgressProxyContainerFrequentlyRestarting: *Warning* alert if
  container restarts > 3 times in 30m
---
 resources/prometheus/prometheus-rules.yaml    | 22 ++++++++
 .../unit_tests/RHACSEgressProxy.yaml          | 54 ++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 resources/prometheus/unit_tests/RHACSEgressProxy.yaml

diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index 1b3b14d0..e1ba80f0 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -97,6 +97,28 @@ spec:
             description: "Scanner container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes."
             sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-003-rhacs-instance-unavailable.md"
 
+    - name: rhacs-egress-proxy
+      rules:
+        # Fires when fewer than 3 egress-proxy replicas are ready for 20 consecutive minutes.
+        - alert: RHACSEgressProxyReplicaCount
+          expr: kube_deployment_status_replicas_ready{namespace=~"rhacs-.*",deployment="egress-proxy"} < 3
+          for: 20m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Egress proxy cannot reach desired replica count (3) in namespace `{{ $labels.namespace }}`."
+            description: "During the last 20 minutes, the egress-proxy deployment in namespace `{{ $labels.namespace }}` has not reached three (3) replicas. This alert is raised when at least one replica is continuously marked as not ready for at least 20 minutes."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-003-rhacs-instance-unavailable.md"
+        # Fires when an egress-proxy container restarted more than 3 times within 30 minutes.
+        - alert: RHACSEgressProxyContainerFrequentlyRestarting
+          expr: increase(kube_pod_container_status_restarts_total{namespace=~"rhacs-.*",container="egress-proxy"}[30m]) > 3
+          labels:
+            severity: warning
+          annotations:
+            summary: "Egress proxy container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times."
+            description: "Egress proxy container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-003-rhacs-instance-unavailable.md"
+
     - name: rhacs-fleetshard
       rules:
         - alert: RHACSFleetshardOperatorContainerDown
diff --git a/resources/prometheus/unit_tests/RHACSEgressProxy.yaml b/resources/prometheus/unit_tests/RHACSEgressProxy.yaml
new file mode 100644
index 00000000..449f87a8
--- /dev/null
+++ b/resources/prometheus/unit_tests/RHACSEgressProxy.yaml
@@ -0,0 +1,54 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  # Ready replicas start at 3 and then drop to 2 and 1; no alert at 10m or 20m, firing at 40m.
+  - interval: 1m
+    input_series:
+      - series: kube_deployment_status_replicas_ready{namespace="rhacs-aaaabbbbccccddddeeee", deployment="egress-proxy", container="kube-rbac-proxy-main"}
+        values: "3x10 2x10 1x20"
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: RHACSEgressProxyReplicaCount
+        exp_alerts: []
+      - eval_time: 20m
+        alertname: RHACSEgressProxyReplicaCount
+        exp_alerts: []
+      - eval_time: 40m
+        alertname: RHACSEgressProxyReplicaCount
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSEgressProxyReplicaCount
+              namespace: rhacs-aaaabbbbccccddddeeee
+              deployment: egress-proxy
+              # NOTE(review): mirrors the observed metrics; presumably the container label comes from the scrape target (kube-rbac-proxy-main) - confirm
+              container: kube-rbac-proxy-main
+              severity: critical
+            exp_annotations:
+              summary: "Egress proxy cannot reach desired replica count (3) in namespace `rhacs-aaaabbbbccccddddeeee`."
+              description: "During the last 20 minutes, the egress-proxy deployment in namespace `rhacs-aaaabbbbccccddddeeee` has not reached three (3) replicas. This alert is raised when at least one replica is continuously marked as not ready for at least 20 minutes."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-003-rhacs-instance-unavailable.md"
+  # Restart counter stays at 0 at first and then keeps increasing; no alert at 10m, firing at 30m.
+  - interval: 1m
+    input_series:
+      - series: kube_pod_container_status_restarts_total{namespace="rhacs-aaaabbbbccccddddeeee", pod="egress-proxy-1234-5678", container="egress-proxy"}
+        values: "0+0x10 1+1x10 10+1x20"
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: RHACSEgressProxyContainerFrequentlyRestarting
+        exp_alerts: []
+      - eval_time: 30m
+        alertname: RHACSEgressProxyContainerFrequentlyRestarting
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSEgressProxyContainerFrequentlyRestarting
+              container: egress-proxy
+              namespace: rhacs-aaaabbbbccccddddeeee
+              pod: egress-proxy-1234-5678
+              severity: warning
+            exp_annotations:
+              summary: "Egress proxy container `egress-proxy-1234-5678/egress-proxy` in namespace `rhacs-aaaabbbbccccddddeeee` restarted more than 3 times."
+              description: "Egress proxy container `egress-proxy-1234-5678/egress-proxy` in namespace `rhacs-aaaabbbbccccddddeeee` has restarted more than 3 times during the last 30 minutes."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-003-rhacs-instance-unavailable.md"