From d7bd2222c0a842dce8f07eb30b73173c11d7a8d5 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 19 Dec 2024 10:15:19 +0000
Subject: [PATCH] Update upstream alerts

---
 .../master/collector_prometheus_alerts.yaml  | 65 +------------
 .../master/lokistack_prometheus_alerts.yaml  | 18 ++++
 .../lokistack_prometheus_alerts.yaml         | 18 ++++
 .../lokistack_prometheus_alerts.yaml         | 18 ++++
 .../lokistack_prometheus_alerts.yaml         | 18 ++++
 .../60_lokistack_alerts.yaml                 | 21 +++++
 .../60_lokistack_alerts.yaml                 | 21 +++++
 .../60_collector_alerts.yaml                 | 93 +------------------
 .../60_lokistack_alerts.yaml                 | 21 +++++
 .../60_lokistack_alerts.yaml                 | 21 +++++
 10 files changed, 161 insertions(+), 153 deletions(-)

diff --git a/component/extracted_alerts/master/collector_prometheus_alerts.yaml b/component/extracted_alerts/master/collector_prometheus_alerts.yaml
index 1942d35..2d5cdf8 100644
--- a/component/extracted_alerts/master/collector_prometheus_alerts.yaml
+++ b/component/extracted_alerts/master/collector_prometheus_alerts.yaml
@@ -9,7 +9,7 @@ spec:
     rules:
     - alert: CollectorNodeDown
       annotations:
-        message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
+        description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
         summary: "Collector cannot be scraped"
       expr: |
         up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
@@ -17,70 +17,9 @@ spec:
       labels:
         service: collector
         severity: critical
-    - alert: CollectorHighErrorRate
-      annotations:
-        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
-        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
-      expr: |
-        100 * (
-          collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-          /
-          collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-        ) > 0.001
-      for: 15m
-      labels:
-        service: collector
-        severity: critical
-    - alert: CollectorVeryHighErrorRate
-      annotations:
-        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
-        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
-      expr: |
-        100 * (
-          collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-          /
-          collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-        ) > 0.05
-      for: 15m
-      labels:
-        service: collector
-        severity: critical
-    - alert: ElasticsearchDeprecation
-      annotations:
-        message: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead."
-        summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in 6.0 release"
-      expr: |
-        sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
-      for: 5m
-      labels:
-        service: storage
-        severity: Warning
-        namespace: openshift-logging
-    - alert: FluentdDeprecation
-      annotations:
-        message: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead."
-        summary: "Detected Fluentd as the collector, which has been removed in a 6.0 release"
-      expr: |
-        sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
-      for: 5m
-      labels:
-        service: collector
-        severity: Warning
-        namespace: openshift-logging
-    - alert: KibanaDeprecation
-      annotations:
-        message: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead."
-        summary: "Detected Kibana as the log data visualization, which has been removed in the 6.0 release"
-      expr: |
-        sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
-      for: 5m
-      labels:
-        service: visualization
-        severity: Warning
-        namespace: openshift-logging
     - alert: DiskBufferUsage
       annotations:
-        message: "Collectors potentially consuming too much node disk, {{ $value }}% "
+        description: "Collectors potentially consuming too much node disk, {{ $value }}% "
         summary: "Detected consuming too much node disk on $labels.hostname host"
       expr: |
         (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)')
diff --git a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml
index 15cc424..799c280 100644
--- a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml
+++ b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml
@@ -175,6 +175,24 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: LokiDiscardedSamplesWarning
+    annotations:
+      message: |-
+        Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+        Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+      summary: Loki is discarding samples during ingestion because they fail validation.
+      runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
+    expr: |
+      sum by(namespace, tenant, reason) (
+        irate(loki_discarded_samples_total{
+          reason!="rate_limited",
+          reason!="per_stream_rate_limit",
+          reason!="stream_limit"}[2m])
+      )
+      > 0
+    for: 15m
+    labels:
+      severity: warning
   - alert: LokistackSchemaUpgradesRequired
     annotations:
       message: |-
diff --git a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml
index f378c49..e0c49d6 100644
--- a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml
+++ b/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml
@@ -175,3 +175,21 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: LokiDiscardedSamplesWarning
+    annotations:
+      message: |-
+        Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+        Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+      summary: Loki is discarding samples during ingestion because they fail validation.
+      runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
+    expr: |
+      sum by(namespace, tenant, reason) (
+        irate(loki_discarded_samples_total{
+          reason!="rate_limited",
+          reason!="per_stream_rate_limit",
+          reason!="stream_limit"}[2m])
+      )
+      > 0
+    for: 15m
+    labels:
+      severity: warning
diff --git a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml
index f378c49..e0c49d6 100644
--- a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml
+++ b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml
@@ -175,3 +175,21 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: LokiDiscardedSamplesWarning
+    annotations:
+      message: |-
+        Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+        Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+      summary: Loki is discarding samples during ingestion because they fail validation.
+      runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
+    expr: |
+      sum by(namespace, tenant, reason) (
+        irate(loki_discarded_samples_total{
+          reason!="rate_limited",
+          reason!="per_stream_rate_limit",
+          reason!="stream_limit"}[2m])
+      )
+      > 0
+    for: 15m
+    labels:
+      severity: warning
diff --git a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml
index 15cc424..799c280 100644
--- a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml
+++ b/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml
@@ -175,6 +175,24 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: LokiDiscardedSamplesWarning
+    annotations:
+      message: |-
+        Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+        Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+      summary: Loki is discarding samples during ingestion because they fail validation.
+      runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
+    expr: |
+      sum by(namespace, tenant, reason) (
+        irate(loki_discarded_samples_total{
+          reason!="rate_limited",
+          reason!="per_stream_rate_limit",
+          reason!="stream_limit"}[2m])
+      )
+      > 0
+    for: 15m
+    labels:
+      severity: warning
   - alert: LokistackSchemaUpgradesRequired
     annotations:
       message: |-
diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
index 65a573e..4f6c7da 100644
--- a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
+++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
@@ -204,6 +204,27 @@ spec:
             severity: warning
             syn: 'true'
             syn_component: openshift4-logging
+        - alert: SYN_LokiDiscardedSamplesWarning
+          annotations:
+            message: |-
+              Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+              Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+            runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
+            summary: Loki is discarding samples during ingestion because they fail
+              validation.
+          expr: |
+            sum by(namespace, tenant, reason) (
+              irate(loki_discarded_samples_total{
+                reason!="rate_limited",
+                reason!="per_stream_rate_limit",
+                reason!="stream_limit"}[2m])
+            )
+            > 0
+          for: 15m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-logging
         - alert: SYN_LokistackSchemaUpgradesRequired
           annotations:
             message: |-
diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
index 65a573e..4f6c7da 100644
--- a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
+++ b/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml
@@ -204,6 +204,27 @@ spec:
             severity: warning
             syn: 'true'
             syn_component: openshift4-logging
+        - alert: SYN_LokiDiscardedSamplesWarning
+          annotations:
+            message: |-
+              Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
+              Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
+            runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
+            summary: Loki is discarding samples during ingestion because they fail
+              validation.
+          expr: |
+            sum by(namespace, tenant, reason) (
+              irate(loki_discarded_samples_total{
+                reason!="rate_limited",
+                reason!="per_stream_rate_limit",
+                reason!="stream_limit"}[2m])
+            )
+            > 0
+          for: 15m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-logging
         - alert: SYN_LokistackSchemaUpgradesRequired
           annotations:
             message: |-
diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml
index 19adca5..2c6ddb1 100644
--- a/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml
+++ b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml
@@ -12,7 +12,7 @@ spec:
       rules:
         - alert: SYN_CollectorNodeDown
           annotations:
-            message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
+            description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
              }} collector component for more than 10m.
             summary: Collector cannot be scraped
           expr: |
@@ -23,97 +23,10 @@ spec:
           severity: critical
           syn: 'true'
           syn_component: openshift4-logging
-        - alert: SYN_CollectorHighErrorRate
-          annotations:
-            message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
-              }}/{{ $labels.pod }} collector component.'
-            summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
-              errors are high'
-          expr: |
-            100 * (
-              collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-              /
-              collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-            ) > 0.001
-          for: 15m
-          labels:
-            service: collector
-            severity: critical
-            syn: 'true'
-            syn_component: openshift4-logging
-        - alert: SYN_CollectorVeryHighErrorRate
-          annotations:
-            message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
-              }}/{{ $labels.pod }} collector component.'
-            summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
-              errors are very high'
-          expr: |
-            100 * (
-              collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-              /
-              collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
-            ) > 0.05
-          for: 15m
-          labels:
-            service: collector
-            severity: critical
-            syn: 'true'
-            syn_component: openshift4-logging
-        - alert: SYN_ElasticsearchDeprecation
-          annotations:
-            message: In Red Hat OpenShift Logging Operator 6.0, support for the Red
-              Hat Elasticsearch Operator has been removed. Bug fixes and support are
-              provided only through the end of the 5.9 lifecycle. As an alternative
-              to the Elasticsearch Operator, you can use the Loki Operator instead.
-            summary: Detected Elasticsearch as the in-cluster storage, which has been
-              removed in 6.0 release
-          expr: |
-            sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
-          for: 5m
-          labels:
-            namespace: openshift-logging
-            service: storage
-            severity: Warning
-            syn: 'true'
-            syn_component: openshift4-logging
-        - alert: SYN_FluentdDeprecation
-          annotations:
-            message: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd
-              as a collector has been removed. Bug fixes and support are provided
-              only through the end of the 5.9 lifecycle. As an alternative to Fluentd,
-              you can use the Vector collector instead.
-            summary: Detected Fluentd as the collector, which has been removed in
-              a 6.0 release
-          expr: |
-            sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
-          for: 5m
-          labels:
-            namespace: openshift-logging
-            service: collector
-            severity: Warning
-            syn: 'true'
-            syn_component: openshift4-logging
-        - alert: SYN_KibanaDeprecation
-          annotations:
-            message: In Red Hat OpenShift Logging Operator 6.0, support for Kibana
-              as a data visualization dashboard has been removed. Bug fixes and support
-              are provided only through the end of the 5.9 lifecycle. As an alternative
-              to Kibana, you can use the Grafana Dashboard instead.
- summary: Detected Kibana as the log data visualization, which has been - removed in the 6.0 release - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..4f6c7da 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -204,6 +204,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..4f6c7da 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -204,6 +204,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |-