Update upstream alerts

appuio · Nov 19, 2024 · commit 5982ffd · 1 parent 85f5517

Showing 10 changed files with 173 additions and 81 deletions.
diff --git a/component/extracted_alerts/master/collector_prometheus_alerts.yaml b/component/extracted_alerts/master/collector_prometheus_alerts.yaml
@@ -9,46 +9,18 @@ spec:
     rules:
     - alert: CollectorNodeDown
       annotations:
-        message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
+        description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
         summary: "Collector cannot be scraped"
       expr: |
         up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
       for: 10m
       labels:
         service: collector
         severity: critical
-    - alert: CollectorHighErrorRate
-      annotations:
-        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
-        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
-      expr: |
-        100 * (
-            collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-          /
-            collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-          ) > 0.001
-      for: 15m
-      labels:
-        service: collector
-        severity: critical
-    - alert: CollectorVeryHighErrorRate
-      annotations:
-        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
-        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
-      expr: |
-        100 * (
-            collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-          /
-            collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-          ) > 0.05
-      for: 15m
-      labels:
-        service: collector
-        severity: critical
     - alert: ElasticsearchDeprecation
       annotations:
-        message: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead."
-        summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in 6.0 release"
+        description: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead."
+        summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in the 6.0 release"
       expr: |
         sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
       for: 5m
@@ -58,8 +30,8 @@ spec:
         namespace: openshift-logging
     - alert: FluentdDeprecation
       annotations:
-        message: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead."
-        summary: "Detected Fluentd as the collector, which has been removed in a 6.0 release"
+        description: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead."
+        summary: "Detected Fluentd as the collector, which has been removed in the 6.0 release"
       expr: |
         sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
       for: 5m
@@ -69,7 +41,7 @@ spec:
         namespace: openshift-logging
     - alert: KibanaDeprecation
       annotations:
-        message: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead."
+        description: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead."
         summary: "Detected Kibana as the log data visualization, which has been removed in the 6.0 release"
       expr: |
         sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
@@ -80,7 +52,7 @@ spec:
         namespace: openshift-logging
     - alert: DiskBufferUsage
       annotations:
-        message: "Collectors potentially consuming too much node disk, {{ $value }}% "
+        description: "Collectors potentially consuming too much node disk, {{ $value }}% "
         summary: "Detected consuming too much node disk on $labels.hostname host"
       expr: |
         (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') 

diff --git a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml
@@ -175,6 +175,24 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: LokiDiscardedSamplesWarning
+    annotations:
+      message: |-
+        Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+        Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+      summary: Loki is discarding samples during ingestion because they fail validation.
+      runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
+    expr: |
+      sum by(namespace, tenant, reason) (
+        irate(loki_discarded_samples_total{
+          reason!="rate_limited",
+          reason!="per_stream_rate_limit",
+          reason!="stream_limit"}[2m])
+      )
+      > 0
+    for: 15m
+    labels:
+      severity: warning
   - alert: LokistackSchemaUpgradesRequired
     annotations:
       message: |-

diff --git a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml
@@ -175,3 +175,21 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: LokiDiscardedSamplesWarning
+    annotations:
+      message: |-
+        Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+        Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+      summary: Loki is discarding samples during ingestion because they fail validation.
+      runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
+    expr: |
+      sum by(namespace, tenant, reason) (
+        irate(loki_discarded_samples_total{
+          reason!="rate_limited",
+          reason!="per_stream_rate_limit",
+          reason!="stream_limit"}[2m])
+      )
+      > 0
+    for: 15m
+    labels:
+      severity: warning
diff --git a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml
@@ -175,3 +175,21 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: LokiDiscardedSamplesWarning
+    annotations:
+      message: |-
+        Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+        Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+      summary: Loki is discarding samples during ingestion because they fail validation.
+      runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
+    expr: |
+      sum by(namespace, tenant, reason) (
+        irate(loki_discarded_samples_total{
+          reason!="rate_limited",
+          reason!="per_stream_rate_limit",
+          reason!="stream_limit"}[2m])
+      )
+      > 0
+    for: 15m
+    labels:
+      severity: warning
diff --git a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml
@@ -175,6 +175,24 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: LokiDiscardedSamplesWarning
+    annotations:
+      message: |-
+        Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+        Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+      summary: Loki is discarding samples during ingestion because they fail validation.
+      runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
+    expr: |
+      sum by(namespace, tenant, reason) (
+        irate(loki_discarded_samples_total{
+          reason!="rate_limited",
+          reason!="per_stream_rate_limit",
+          reason!="stream_limit"}[2m])
+      )
+      > 0
+    for: 15m
+    labels:
+      severity: warning
   - alert: LokistackSchemaUpgradesRequired
     annotations:
       message: |-

diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
@@ -204,6 +204,27 @@ spec:
             severity: warning
             syn: 'true'
             syn_component: openshift4-logging
+        - alert: SYN_LokiDiscardedSamplesWarning
+          annotations:
+            message: |-
+              Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+              Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+            runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
+            summary: Loki is discarding samples during ingestion because they fail
+              validation.
+          expr: |
+            sum by(namespace, tenant, reason) (
+              irate(loki_discarded_samples_total{
+                reason!="rate_limited",
+                reason!="per_stream_rate_limit",
+                reason!="stream_limit"}[2m])
+            )
+            > 0
+          for: 15m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-logging
         - alert: SYN_LokistackSchemaUpgradesRequired
           annotations:
             message: |-

diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
@@ -204,6 +204,27 @@ spec:
             severity: warning
             syn: 'true'
             syn_component: openshift4-logging
+        - alert: SYN_LokiDiscardedSamplesWarning
+          annotations:
+            message: |-
+              Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+              Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+            runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
+            summary: Loki is discarding samples during ingestion because they fail
+              validation.
+          expr: |
+            sum by(namespace, tenant, reason) (
+              irate(loki_discarded_samples_total{
+                reason!="rate_limited",
+                reason!="per_stream_rate_limit",
+                reason!="stream_limit"}[2m])
+            )
+            > 0
+          for: 15m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-logging
         - alert: SYN_LokistackSchemaUpgradesRequired
           annotations:
             message: |-

diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml
@@ -12,7 +12,7 @@ spec:
       rules:
         - alert: SYN_CollectorNodeDown
           annotations:
-            message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
+            description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
               }} collector component for more than 10m.
             summary: Collector cannot be scraped
           expr: |
@@ -23,50 +23,14 @@ spec:
             severity: critical
             syn: 'true'
             syn_component: openshift4-logging
-        - alert: SYN_CollectorHighErrorRate
-          annotations:
-            message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
-              }}/{{ $labels.pod }} collector component.'
-            summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
-              errors are high'
-          expr: |
-            100 * (
-                collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-              /
-                collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-              ) > 0.001
-          for: 15m
-          labels:
-            service: collector
-            severity: critical
-            syn: 'true'
-            syn_component: openshift4-logging
-        - alert: SYN_CollectorVeryHighErrorRate
-          annotations:
-            message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
-              }}/{{ $labels.pod }} collector component.'
-            summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
-              errors are very high'
-          expr: |
-            100 * (
-                collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-              /
-                collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-              ) > 0.05
-          for: 15m
-          labels:
-            service: collector
-            severity: critical
-            syn: 'true'
-            syn_component: openshift4-logging
         - alert: SYN_ElasticsearchDeprecation
           annotations:
-            message: In Red Hat OpenShift Logging Operator 6.0, support for the Red
-              Hat Elasticsearch Operator has been removed. Bug fixes and support are
-              provided only through the end of the 5.9 lifecycle. As an alternative
+            description: In Red Hat OpenShift Logging Operator 6.0, support for the
+              Red Hat Elasticsearch Operator has been removed. Bug fixes and support
+              are provided only through the end of the 5.9 lifecycle. As an alternative
               to the Elasticsearch Operator, you can use the Loki Operator instead.
             summary: Detected Elasticsearch as the in-cluster storage, which has been
-              removed in 6.0 release
+              removed in the 6.0 release
           expr: |
             sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
           for: 5m
@@ -78,12 +42,12 @@ spec:
             syn_component: openshift4-logging
         - alert: SYN_FluentdDeprecation
           annotations:
-            message: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd
+            description: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd
               as a collector has been removed. Bug fixes and support are provided
               only through the end of the 5.9 lifecycle. As an alternative to Fluentd,
               you can use the Vector collector instead.
             summary: Detected Fluentd as the collector, which has been removed in
-              a 6.0 release
+              the 6.0 release
           expr: |
             sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
           for: 5m
@@ -95,7 +59,7 @@ spec:
             syn_component: openshift4-logging
         - alert: SYN_KibanaDeprecation
           annotations:
-            message: In Red Hat OpenShift Logging Operator 6.0, support for Kibana
+            description: In Red Hat OpenShift Logging Operator 6.0, support for Kibana
               as a data visualization dashboard has been removed. Bug fixes and support
               are provided only through the end of the 5.9 lifecycle. As an alternative
               to Kibana, you can use the Grafana Dashboard instead.
@@ -112,8 +76,8 @@ spec:
             syn_component: openshift4-logging
         - alert: SYN_DiskBufferUsage
           annotations:
-            message: 'Collectors potentially consuming too much node disk, {{ $value
-              }}% '
+            description: 'Collectors potentially consuming too much node disk, {{
+              $value }}% '
             summary: Detected consuming too much node disk on $labels.hostname host
           expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\
             \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\

diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
@@ -204,6 +204,27 @@ spec:
             severity: warning
             syn: 'true'
             syn_component: openshift4-logging
+        - alert: SYN_LokiDiscardedSamplesWarning
+          annotations:
+            message: |-
+              Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+              Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+            runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
+            summary: Loki is discarding samples during ingestion because they fail
+              validation.
+          expr: |
+            sum by(namespace, tenant, reason) (
+              irate(loki_discarded_samples_total{
+                reason!="rate_limited",
+                reason!="per_stream_rate_limit",
+                reason!="stream_limit"}[2m])
+            )
+            > 0
+          for: 15m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-logging
         - alert: SYN_LokistackSchemaUpgradesRequired
           annotations:
             message: |-

diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
@@ -204,6 +204,27 @@ spec:
             severity: warning
             syn: 'true'
             syn_component: openshift4-logging
+        - alert: SYN_LokiDiscardedSamplesWarning
+          annotations:
+            message: |-
+              Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+              Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+            runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
+            summary: Loki is discarding samples during ingestion because they fail
+              validation.
+          expr: |
+            sum by(namespace, tenant, reason) (
+              irate(loki_discarded_samples_total{
+                reason!="rate_limited",
+                reason!="per_stream_rate_limit",
+                reason!="stream_limit"}[2m])
+            )
+            > 0
+          for: 15m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-logging
         - alert: SYN_LokistackSchemaUpgradesRequired
           annotations:
             message: |-