Skip to content

Commit

Permalink
Update upstream alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
github-actions[bot] committed Nov 19, 2024
1 parent 85f5517 commit 5982ffd
Show file tree
Hide file tree
Showing 10 changed files with 173 additions and 81 deletions.
42 changes: 7 additions & 35 deletions component/extracted_alerts/master/collector_prometheus_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,46 +9,18 @@ spec:
rules:
- alert: CollectorNodeDown
annotations:
message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
summary: "Collector cannot be scraped"
expr: |
up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
for: 10m
labels:
service: collector
severity: critical
- alert: CollectorHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.001
for: 15m
labels:
service: collector
severity: critical
- alert: CollectorVeryHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.05
for: 15m
labels:
service: collector
severity: critical
- alert: ElasticsearchDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead."
summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in 6.0 release"
description: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead."
summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in the 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
for: 5m
Expand All @@ -58,8 +30,8 @@ spec:
namespace: openshift-logging
- alert: FluentdDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead."
summary: "Detected Fluentd as the collector, which has been removed in a 6.0 release"
description: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead."
summary: "Detected Fluentd as the collector, which has been removed in the 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
for: 5m
Expand All @@ -69,7 +41,7 @@ spec:
namespace: openshift-logging
- alert: KibanaDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead."
description: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead."
summary: "Detected Kibana as the log data visualization, which has been removed in the 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
Expand All @@ -80,7 +52,7 @@ spec:
namespace: openshift-logging
- alert: DiskBufferUsage
annotations:
message: "Collectors potentially consuming too much node disk, {{ $value }}% "
description: "Collectors potentially consuming too much node disk, {{ $value }}% "
summary: "Detected consuming too much node disk on $labels.hostname host"
expr: |
(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)')
Expand Down
18 changes: 18 additions & 0 deletions component/extracted_alerts/master/lokistack_prometheus_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,24 @@ groups:
for: 15m
labels:
severity: warning
# Warning-severity rule: fires when the per-(namespace, tenant, reason) sum of
# irate(loki_discarded_samples_total[2m]) stays above 0 for 15m. The selector
# excludes the rate_limited, per_stream_rate_limit and stream_limit reasons.
# NOTE(review): runbook_url still carries the "[[ .RunbookURL]]" placeholder -
# presumably substituted by a later templating step; confirm.
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
- alert: LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,21 @@ groups:
for: 15m
labels:
severity: warning
# Warning-severity rule: fires when the per-(namespace, tenant, reason) sum of
# irate(loki_discarded_samples_total[2m]) stays above 0 for 15m. The selector
# excludes the rate_limited, per_stream_rate_limit and stream_limit reasons.
# NOTE(review): runbook_url still carries the "[[ .RunbookURL]]" placeholder -
# presumably substituted by a later templating step; confirm.
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,21 @@ groups:
for: 15m
labels:
severity: warning
# Warning-severity rule: fires when the per-(namespace, tenant, reason) sum of
# irate(loki_discarded_samples_total[2m]) stays above 0 for 15m. The selector
# excludes the rate_limited, per_stream_rate_limit and stream_limit reasons.
# NOTE(review): runbook_url still carries the "[[ .RunbookURL]]" placeholder -
# presumably substituted by a later templating step; confirm.
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,24 @@ groups:
for: 15m
labels:
severity: warning
# Warning-severity rule: fires when the per-(namespace, tenant, reason) sum of
# irate(loki_discarded_samples_total[2m]) stays above 0 for 15m. The selector
# excludes the rate_limited, per_stream_rate_limit and stream_limit reasons.
# NOTE(review): runbook_url still carries the "[[ .RunbookURL]]" placeholder -
# presumably substituted by a later templating step; confirm.
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
- alert: LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
# SYN_-prefixed warning rule: fires when the per-(namespace, tenant, reason)
# sum of irate(loki_discarded_samples_total[2m]) stays above 0 for 15m. The
# selector excludes the rate_limited, per_stream_rate_limit and stream_limit
# reasons. Besides severity, the rule is labelled syn: 'true' and
# syn_component: openshift4-logging.
# NOTE(review): runbook_url still carries the "[[ .RunbookURL]]" placeholder -
# presumably substituted by a later templating step; confirm.
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
# SYN_-prefixed warning rule: fires when the per-(namespace, tenant, reason)
# sum of irate(loki_discarded_samples_total[2m]) stays above 0 for 15m. The
# selector excludes the rate_limited, per_stream_rate_limit and stream_limit
# reasons. Besides severity, the rule is labelled syn: 'true' and
# syn_component: openshift4-logging.
# NOTE(review): runbook_url still carries the "[[ .RunbookURL]]" placeholder -
# presumably substituted by a later templating step; confirm.
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
rules:
- alert: SYN_CollectorNodeDown
annotations:
message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
}} collector component for more than 10m.
summary: Collector cannot be scraped
expr: |
Expand All @@ -23,50 +23,14 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_CollectorHighErrorRate
annotations:
message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
}}/{{ $labels.pod }} collector component.'
summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
errors are high'
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.001
for: 15m
labels:
service: collector
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_CollectorVeryHighErrorRate
annotations:
message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
}}/{{ $labels.pod }} collector component.'
summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
errors are very high'
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.05
for: 15m
labels:
service: collector
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_ElasticsearchDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for the Red
Hat Elasticsearch Operator has been removed. Bug fixes and support are
provided only through the end of the 5.9 lifecycle. As an alternative
description: In Red Hat OpenShift Logging Operator 6.0, support for the
Red Hat Elasticsearch Operator has been removed. Bug fixes and support
are provided only through the end of the 5.9 lifecycle. As an alternative
to the Elasticsearch Operator, you can use the Loki Operator instead.
summary: Detected Elasticsearch as the in-cluster storage, which has been
removed in 6.0 release
removed in the 6.0 release
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
for: 5m
Expand All @@ -78,12 +42,12 @@ spec:
syn_component: openshift4-logging
- alert: SYN_FluentdDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd
description: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd
as a collector has been removed. Bug fixes and support are provided
only through the end of the 5.9 lifecycle. As an alternative to Fluentd,
you can use the Vector collector instead.
summary: Detected Fluentd as the collector, which has been removed in
a 6.0 release
the 6.0 release
expr: |
sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
for: 5m
Expand All @@ -95,7 +59,7 @@ spec:
syn_component: openshift4-logging
- alert: SYN_KibanaDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for Kibana
description: In Red Hat OpenShift Logging Operator 6.0, support for Kibana
as a data visualization dashboard has been removed. Bug fixes and support
are provided only through the end of the 5.9 lifecycle. As an alternative
to Kibana, you can use the Grafana Dashboard instead.
Expand All @@ -112,8 +76,8 @@ spec:
syn_component: openshift4-logging
- alert: SYN_DiskBufferUsage
annotations:
message: 'Collectors potentially consuming too much node disk, {{ $value
}}% '
description: 'Collectors potentially consuming too much node disk, {{
$value }}% '
summary: Detected consuming too much node disk on $labels.hostname host
expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\
\ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
# SYN_-prefixed warning rule: fires when the per-(namespace, tenant, reason)
# sum of irate(loki_discarded_samples_total[2m]) stays above 0 for 15m. The
# selector excludes the rate_limited, per_stream_rate_limit and stream_limit
# reasons. Besides severity, the rule is labelled syn: 'true' and
# syn_component: openshift4-logging.
# NOTE(review): runbook_url still carries the "[[ .RunbookURL]]" placeholder -
# presumably substituted by a later templating step; confirm.
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
# SYN_-prefixed warning rule: fires when the per-(namespace, tenant, reason)
# sum of irate(loki_discarded_samples_total[2m]) stays above 0 for 15m. The
# selector excludes the rate_limited, per_stream_rate_limit and stream_limit
# reasons. Besides severity, the rule is labelled syn: 'true' and
# syn_component: openshift4-logging.
# NOTE(review): runbook_url still carries the "[[ .RunbookURL]]" placeholder -
# presumably substituted by a later templating step; confirm.
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down

0 comments on commit 5982ffd

Please sign in to comment.