diff --git a/.cruft.json b/.cruft.json index 12d154e..321f781 100644 --- a/.cruft.json +++ b/.cruft.json @@ -1,13 +1,13 @@ { "template": "https://github.com/projectsyn/commodore-component-template.git", - "commit": "8840f87d25d97ce0d4bfed75d40173caaf4100fc", + "commit": "ff9d5a839714344345b76be069ea23e39e580f38", "checkout": "main", "context": { "cookiecutter": { "name": "OpenShift4 Logging", "slug": "openshift4-logging", "parameter_key": "openshift4_logging", - "test_cases": "defaults master elasticsearch multilineerr forwardingonly legacy", + "test_cases": "defaults master multilineerr forwardingonly", "add_lib": "n", "add_pp": "n", "add_golden": "y", diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 647a541..9209929 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -33,12 +33,9 @@ jobs: matrix: instance: - defaults - - release-5.4 - master - - elasticsearch - multilineerr - forwardingonly - - legacy defaults: run: working-directory: ${{ env.COMPONENT_NAME }} @@ -55,10 +52,8 @@ jobs: instance: - defaults - master - - elasticsearch - multilineerr - forwardingonly - - legacy defaults: run: working-directory: ${{ env.COMPONENT_NAME }} diff --git a/Makefile.vars.mk b/Makefile.vars.mk index 538e4d0..1c67426 100644 --- a/Makefile.vars.mk +++ b/Makefile.vars.mk @@ -57,4 +57,4 @@ KUBENT_IMAGE ?= ghcr.io/doitintl/kube-no-trouble:latest KUBENT_DOCKER ?= $(DOCKER_CMD) $(DOCKER_ARGS) $(root_volume) --entrypoint=/app/kubent $(KUBENT_IMAGE) instance ?= defaults -test_instances = tests/defaults.yml tests/master.yml tests/elasticsearch.yml tests/multilineerr.yml tests/forwardingonly.yml tests/legacy.yml +test_instances = tests/defaults.yml tests/master.yml tests/multilineerr.yml tests/forwardingonly.yml diff --git a/alerts.txt b/alerts.txt index 33bbfb7..66fc2a5 100644 --- a/alerts.txt +++ b/alerts.txt @@ -1,17 +1,7 @@ -https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.6/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.6/fluentd_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.7/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.7/fluentd_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.8/config/prometheus/collector_alerts.yaml release-5.8/collector_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.9/config/prometheus/collector_alerts.yaml release-5.9/collector_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-6.0/config/prometheus/collector_alerts.yaml release-6.0/collector_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-6.1/config/prometheus/collector_alerts.yaml release-6.1/collector_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/config/prometheus/collector_alerts.yaml master/collector_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.6/files/prometheus_alerts.yml release-5.6/elasticsearch_operator_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.7/files/prometheus_alerts.yml release-5.7/elasticsearch_operator_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml 
release-5.8/elasticsearch_operator_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.9/elasticsearch_operator_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/master/files/prometheus_alerts.yml master/elasticsearch_operator_prometheus_alerts.yaml - -https://raw.githubusercontent.com/openshift/loki/release-5.6/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.6/lokistack_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/loki/release-5.7/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.7/lokistack_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/loki/release-5.8/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.8/lokistack_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/loki/release-5.9/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.9/lokistack_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/loki/release-6.0/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-6.0/lokistack_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/loki/release-6.1/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-6.1/lokistack_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/loki/main/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml master/lokistack_prometheus_alerts.yaml diff --git a/class/defaults.yml b/class/defaults.yml index 6b91e13..7717ca9 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -8,9 +8,9 @@ parameters: "False": {} namespace: openshift-logging - version: '5.9' + + version: '6.1' channel: 'stable-${openshift4_logging:version}' - alerts: 'release-${openshift4_logging:version}' components: lokistack: @@ -42,16 +42,6 @@ parameters: ingestion: ingestionBurstSize: 9 ingestionRate: 5 - elasticsearch: - enabled: false - kibana_host: null - predict_elasticsearch_storage_alert: - enabled: true - lookback_range: 72h - predict_hours_from_now: 72 - threshold: 85 - for: 6h - severity: warning logmetrics: enabled: false spec: @@ -65,14 +55,15 @@ parameters: cpu: 200m memory: 128Mi - clusterLogging: {} clusterLogForwarder: {} - namespaceLogForwarderEnabled: false - namespaceLogForwarder: {} - secrets: {} + alerts: + release: 'release-${openshift4_logging:version}' + ignore: [] + patch: {} + operatorResources: clusterLogging: requests: @@ -86,12 +77,6 @@ parameters: cpu: 50m limits: memory: 512Mi - elasticsearch: - requests: - memory: 1Gi - cpu: 100m - limits: - memory: 1.5Gi images: kubectl: @@ -104,17 +89,4 @@ parameters: schedule: '*/10 * * * *' sleep_time: 2m - ignore_alerts: - - ElasticsearchHighFileDescriptorUsage - - ElasticsearchOperatorCSVNotSuccessful - - FluentdQueueLengthIncreasing - - patch_alerts: - FluentdQueueLengthIncreasing: - for: '12h' - - openshift4_elasticsearch_operator: - targetNamespaces: - - ${openshift4_logging:namespace} - openshift4_console: ${openshift4_logging:_openshift4_console:${openshift4_logging:components:lokistack:enabled}} diff --git a/component/alertrules.libsonnet b/component/alertrules.libsonnet index 2182703..3bb2ce8 100644 --- a/component/alertrules.libsonnet +++ b/component/alertrules.libsonnet @@ -2,12 +2,10 @@ local alertpatching = import 'lib/alert-patching.libsonnet'; local com = import 'lib/commodore.libjsonnet'; local kap = import 
'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; -local utils = import 'utils.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.openshift4_logging; -local elasticsearch = inv.parameters.openshift4_logging.components.elasticsearch; -local loki = inv.parameters.openshift4_logging.components.lokistack; +local lokiEnabled = params.components.lokistack.enabled; local runbook(alertname) = 'https://hub.syn.tools/openshift4-logging/runbooks/%s.html' % alertname; @@ -16,63 +14,35 @@ assert std.member(inv.applications, 'openshift4-monitoring') : 'Component `openshift4-monitoring` not enabled'; -// Keep config backwards compatible -local predict_storage_alert = elasticsearch.predict_elasticsearch_storage_alert + ( - if std.objectHas(params, 'predict_elasticsearch_storage_alert') then - std.trace( - 'parameter predict_elasticsearch_storage_alert is deprecated, please use parameter `components.elasticsearch.predict_elasticsearch_storage_alert instead`', - com.makeMergeable(params.predict_elasticsearch_storage_alert) - ) - else {} -); - +// Upstream alerts to ignore // Keep only alerts from params.ignore_alerts for which the last // array entry wasn't prefixed with `~`. -local user_ignore_alerts = com.renderArray(params.ignore_alerts); - -// Upstream alerts to ignore local ignore_alerts = std.set( // Add set of upstream alerts that should be ignored from processed value of // `params.ignore_alerts` - user_ignore_alerts + local old_ignore = if std.objectHas(params, 'ignore_alerts') then + std.trace('Parameter `ignore_alerts` is deprecated, please migrate your config to `alerts.ignore`.', params.ignore_alerts) + else + []; + com.renderArray(params.alerts.ignore) + + com.renderArray(old_ignore) ); // Alert rule patches. // Provide partial objects for alert rules that need to be tuned compared to // upstream. The keys in this object correspond to the `alert` field of the // rule for which the patch is intended. -local patch_alerts = params.patch_alerts; +local patch_alerts = + local old_patch = if std.objectHas(params, 'patch_alerts') then + std.trace('Parameter `patch_alerts` is deprecated, please migrate your config to `alerts.patch`.', params.patch_alerts) + else + {}; + params.alerts.patch + old_patch; local loadFile(file) = - local fpath = 'openshift4-logging/component/extracted_alerts/%s/%s' % [ params.alerts, file ]; + local fpath = 'openshift4-logging/component/extracted_alerts/%s/%s' % [ params.alerts.release, file ]; std.parseJson(kap.yaml_load_stream(fpath)); - -// This will be processed by filter_patch_rules() as well -local predictESStorage = { - local alertName = 'ElasticsearchExpectNodeToReachDiskWatermark', - local hoursFromNow = predict_storage_alert.predict_hours_from_now, - local secondsFromNow = hoursFromNow * 3600, - alert: alertName, - annotations: { - message: ( - 'Expecting to reach disk low watermark at {{ $labels.node }} node in {{ $labels.cluster }} cluster in %s hours.' - + ' When reaching the watermark no new shards will be allocated to this node anymore. You should consider adding more disk to the node.' 
- ) % std.toString(hoursFromNow), - runbook_url: runbook('SYN_' + alertName), - summary: 'Expecting to Reach Disk Low Watermark in %s Hours' % std.toString(hoursFromNow), - }, - expr: ||| - sum by(cluster, instance, node) ( - (1 - (predict_linear(es_fs_path_available_bytes[%s], %s) / es_fs_path_total_bytes)) * 100 - ) > %s - ||| % [ predict_storage_alert.lookback_range, std.toString(secondsFromNow), std.toString(predict_storage_alert.threshold) ], - 'for': predict_storage_alert['for'], - labels: { - severity: predict_storage_alert.severity, - }, -}; - local renderRunbookBaseURL(group, baseURL) = { name: group.name, rules: std.map( @@ -103,7 +73,11 @@ local dropInfoRules = }; local prometheus_rules(name, groups, baseURL) = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', name) { - metadata+: { + metadata: { + labels: { + name: name, + }, + name: name, namespace: params.namespace, }, spec: { @@ -119,23 +93,6 @@ local prometheus_rules(name, groups, baseURL) = kube._Object('monitoring.coreos. }, }; - -// Elasticstack alerts - -local esStorageGroup = { - name: 'elasticsearch_node_storage.alerts', - rules: [ predictESStorage ], -}; -local fluentdGroup = if !utils.isVersion58 then loadFile('fluentd_prometheus_alerts.yaml')[0].groups else []; - -local esGroups = - loadFile('elasticsearch_operator_prometheus_alerts.yaml')[0].groups + - fluentdGroup + - [ - if predict_storage_alert.enabled then esStorageGroup, - ]; -local esBaseURL = 'https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md'; - // Lokistack alerts local lokiGroups = loadFile('lokistack_prometheus_alerts.yaml')[0].groups; @@ -146,7 +103,6 @@ local lokiBaseURL = 'https://github.com/grafana/loki/blob/main/operator/docs/lok local collectorGroups = loadFile('collector_prometheus_alerts.yaml')[0].spec.groups; { - [if elasticsearch.enabled then '60_elasticsearch_alerts']: prometheus_rules('syn-elasticsearch-logging-rules', esGroups, esBaseURL), - [if loki.enabled then '60_lokistack_alerts']: prometheus_rules('syn-loki-logging-rules', lokiGroups, lokiBaseURL), - [if utils.isVersion58 then '60_collector_alerts']: prometheus_rules('syn-collector-rules', collectorGroups, ''), + [if lokiEnabled then '60_lokistack_alerts']: prometheus_rules('syn-loki-logging-rules', lokiGroups, lokiBaseURL), + '60_collector_alerts': prometheus_rules('syn-collector-rules', collectorGroups, ''), } diff --git a/component/app.jsonnet b/component/app.jsonnet index 105fbf1..bd5d9f0 100644 --- a/component/app.jsonnet +++ b/component/app.jsonnet @@ -3,8 +3,14 @@ local inv = kap.inventory(); local params = inv.parameters.openshift4_logging; local argocd = import 'lib/argocd.libjsonnet'; -local app = argocd.App('openshift4-logging', params.namespace); - { - 'openshift4-logging': app, + 'openshift4-logging': argocd.App('openshift4-logging', params.namespace) { + spec+: { + syncPolicy+: { + syncOptions+: [ + 'ServerSideApply=true', + ], + }, + }, + }, } diff --git a/component/config_forwarding.libsonnet b/component/config_forwarding.libsonnet deleted file mode 100644 index c1cf1e5..0000000 --- a/component/config_forwarding.libsonnet +++ /dev/null @@ -1,318 +0,0 @@ -local com = import 'lib/commodore.libjsonnet'; -local kap = import 'lib/kapitan.libjsonnet'; -local kube = import 'lib/kube.libjsonnet'; -local lib = import 'lib/openshift4-logging.libsonnet'; -local utils = import 'utils.libsonnet'; - -local inv = kap.inventory(); -local params = inv.parameters.openshift4_logging; - -local deployLokistack = params.components.lokistack.enabled; 
-local deployElasticsearch = params.components.elasticsearch.enabled; -local forwardingOnly = !deployLokistack && !deployElasticsearch; - -// Make sure the default output is added to the pipelines `outputRefs`, -// if the logging stack is not disabled. -local pipelineOutputRefs(pipeline) = - local default = if forwardingOnly then [] else [ 'default' ]; - std.get(pipeline, 'forwarders', []) + default; - -// ----------------------------------------------------------------------------- -// Legacy Rendering -// ----------------------------------------------------------------------------- - -local legacyConfig = std.get(params, 'clusterLogForwarding', {}); -local hasLegacyConfig = if std.length(legacyConfig) > 0 then std.trace( - 'Parameter `clusterLogForwarding` is deprecated. Please update your config to use `clusterLogForwarder`', - true -) else false; - -// Apply default config for application logs. -local patchLegacyAppLogDefaults = { - local pipeline = std.get(legacyConfig, 'application_logs', { enabled: true }), - local pipelineOutputs = pipelineOutputRefs(pipeline), - local pipelineEnabled = std.length(pipelineOutputs) > 0, - - [if hasLegacyConfig then 'pipelines']: { - [if pipelineEnabled then 'application-logs']: { - inputRefs: [ 'application' ], - outputRefs: pipelineOutputs, - }, - }, -}; - -// Apply default config for infra logs. -local patchLegacyInfraLogDefaults = { - local pipeline = { enabled: true } + std.get(legacyConfig, 'infrastructure_logs', {}), - local pipelineOutputs = pipelineOutputRefs(pipeline), - local pipelineEnabled = pipeline.enabled && std.length(pipelineOutputs) > 0, - - [if hasLegacyConfig then 'pipelines']: { - [if pipelineEnabled then 'infrastructure-logs']: { - inputRefs: [ 'infrastructure' ], - outputRefs: pipelineOutputs, - }, - }, -}; - -// Apply default config for audit logs. -local patchLegacyAuditLogDefaults = { - local pipeline = std.get(legacyConfig, 'audit_logs', { enabled: false }), - local pipelineOutputs = pipelineOutputRefs(pipeline), - local pipelineEnabled = pipeline.enabled && std.length(pipelineOutputs) > 0, - - [if hasLegacyConfig then 'pipelines']: { - [if pipelineEnabled then 'audit-logs']: { - inputRefs: [ 'audit' ], - outputRefs: pipelineOutputs, - }, - }, -}; - -// Enable json parsing for default pipelines if configured. -local legacyEnableJson = std.get(std.get(legacyConfig, 'json', {}), 'enabled', false); -local patchLegacyJsonLogging = { - local enableAppLogs = std.get(std.get(legacyConfig, 'application_logs', {}), 'json', false), - local enableInfraLogs = std.get(std.get(legacyConfig, 'infrastructure_logs', {}), 'json', false), - - [if hasLegacyConfig then 'pipelines']: { - [if enableAppLogs then 'application-logs']: { parse: 'json' }, - [if enableInfraLogs then 'infrastructure-logs']: { parse: 'json' }, - }, - [if deployElasticsearch && legacyEnableJson then 'outputDefaults']: { - elasticsearch: { - structuredTypeKey: std.get(legacyConfig.json, 'typekey', 'kubernetes.labels.logFormat'), - structuredTypeName: std.get(legacyConfig.json, 'typename', 'nologformat'), - }, - }, -}; - -// Enable detectMultilineErrors for default pipelines if configured. 
-local patchLegacyMultilineErrors = { - local enableAppLogs = std.get(std.get(legacyConfig, 'application_logs', {}), 'detectMultilineErrors', false), - local enableInfraLogs = std.get(std.get(legacyConfig, 'infrastructure_logs', {}), 'detectMultilineErrors', false), - - [if hasLegacyConfig then 'pipelines']: { - [if enableAppLogs then 'application-logs']: { detectMultilineErrors: true }, - [if enableInfraLogs then 'infrastructure-logs']: { detectMultilineErrors: true }, - }, -}; - -// --- patch deprecated `clusterLogForwarding.namespace` config -local namespaceGroups = ( - if std.objectHas(legacyConfig, 'namespaces') then - { - [ns]: { - namespaces: [ ns ], - forwarders: [ legacyConfig.namespaces[ns].forwarder ], - } - for ns in std.objectFields(legacyConfig.namespaces) - } else {} -) + std.get(legacyConfig, 'namespace_groups', {}); -// --- patch end - -// Add inputs entry for every namespace_group defined in `clusterLogForwarding.namespace_groups`. -local patchLegacyCustomInputs = { - [if std.length(namespaceGroups) > 0 then 'inputs']: { - [group]: { - application: { - namespaces: namespaceGroups[group].namespaces, - }, - } - for group in std.objectFields(namespaceGroups) - if hasLegacyConfig - }, -}; - -// Add pipelines entry for every namespace_group defined in `clusterLogForwarding.namespace_groups`. -local patchLegacyCustomPipelines = { - [if std.length(namespaceGroups) > 0 then 'pipelines']: { - local enableJson = std.get(namespaceGroups[group], 'json', false), - local enableMultilineError = std.get(namespaceGroups[group], 'detectMultilineErrors', false), - - [group]: { - inputRefs: [ group ], - outputRefs: std.get(namespaceGroups[group], 'forwarders', []), - [if enableJson then 'parse']: 'json', - [if enableMultilineError then 'detectMultilineErrors']: true, - } - for group in std.objectFields(namespaceGroups) - if hasLegacyConfig - }, -}; - -// Add outputs entry for every forwarder defined in `clusterLogForwarding.forwarders`. -local patchLegacyCustomOutputs = { - [if std.length(std.get(legacyConfig, 'forwarders', {})) > 0 then 'outputs']: { - [name]: legacyConfig.forwarders[name] - for name in std.objectFields(legacyConfig.forwarders) - if hasLegacyConfig - }, -}; - -// ----------------------------------------------------------------------------- -// End Legacy Rendering -// ----------------------------------------------------------------------------- - -// Add defaults to pipelines config -local patchPipelineDefaults = { - local appsPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'application-logs', {}), - local infraPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'infrastructure-logs', {}), - local auditPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'audit-logs', {}), - - pipelines: { - [if !forwardingOnly || std.length(appsPipeline) > 0 then 'application-logs']: { - inputRefs: [ 'application' ], - outputRefs: pipelineOutputRefs(appsPipeline), - }, - [if !forwardingOnly || std.length(infraPipeline) > 0 then 'infrastructure-logs']: { - inputRefs: [ 'infrastructure' ], - outputRefs: pipelineOutputRefs(infraPipeline), - }, - [if std.length(auditPipeline) > 0 then 'audit-logs']: { - inputRefs: [ 'audit' ], - }, - }, -}; - -// clusterLogForwarderSpec: -// Consecutively apply patches to result of previous apply. -local clusterLogForwarderSpec = std.foldl( - // we use std.mergePatch here, because this way we don't need - // to make each patch object mergeable by suffixing all keys with a +. 
- function(manifest, patch) std.mergePatch(manifest, patch), - [ - patchPipelineDefaults, - // Apply legacy patches / defaults - patchLegacyAppLogDefaults, - patchLegacyInfraLogDefaults, - patchLegacyAuditLogDefaults, - patchLegacyJsonLogging, - patchLegacyMultilineErrors, - patchLegacyCustomInputs, - patchLegacyCustomOutputs, - patchLegacyCustomPipelines, - ], - { - inputs: {}, - outputs: {}, - pipelines: {}, - }, -) + com.makeMergeable(params.clusterLogForwarder); - -// Unfold objects into array for ClusterLogForwarder resource. -local unfoldSpecs(specs) = { - // Unfold objects into array. - [if std.length(specs.inputs) > 0 then 'inputs']: [ - { name: name } + specs.inputs[name] - for name in std.objectFields(specs.inputs) - ], - [if std.length(specs.outputs) > 0 then 'outputs']: [ - { name: name } + specs.outputs[name] - for name in std.objectFields(specs.outputs) - ], - [if std.length(specs.pipelines) > 0 then 'pipelines']: [ - { name: name } + specs.pipelines[name] - for name in std.objectFields(specs.pipelines) - ], -} + { - // Import remaining specs as is. - [key]: specs[key] - for key in std.objectFields(specs) - if !std.member([ 'inputs', 'outputs', 'pipelines' ], key) -}; - -// ClusterLogForwarder: -// Create definitive ClusterLogForwarder resource from specs. -local clusterLogForwarder = lib.ClusterLogForwarder(params.namespace, 'instance') { - metadata+: { - annotations+: { - 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', - }, - }, - spec: unfoldSpecs(clusterLogForwarderSpec), -}; - -// namespaceLogForwarderIgnoreKeys -// List of keys to ignore in namespaceLogForwarder -local namespaceLogForwarderIgnoreKeys = [ - 'instance', - 'openshift-logging/instance', -]; -// namespaceLogForwarder: -// Create namespaced LogForwarder resource from specs. -local namespaceLogForwarder = [ - local specs = { inputs: {}, outputs: {}, pipelines: {} } + com.makeMergeable(params.namespaceLogForwarder[forwarder]); - local name = utils.namespacedName(forwarder).name; - local namespace = utils.namespacedName(forwarder).namespace; - local serviceAccount = std.get(specs, 'serviceAccountName', utils.namespacedName(forwarder).name); - - lib.ClusterLogForwarder(namespace, name) { - metadata+: { - annotations+: { - 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', - }, - }, - spec: { serviceAccountName: serviceAccount } + com.makeMergeable(unfoldSpecs(specs)), - } - for forwarder in std.objectFields(params.namespaceLogForwarder) - if !std.member(namespaceLogForwarderIgnoreKeys, forwarder) -]; - -// namespaceServiceAccount: -// Create ServiceAccount for namespaced LogForwarder specs. -local namespaceServiceAccount = [ - local specs = params.namespaceLogForwarder[forwarder]; - local namespace = utils.namespacedName(forwarder).namespace; - local serviceAccount = std.get(specs, 'serviceAccountName', utils.namespacedName(forwarder).name); - - kube.ServiceAccount(serviceAccount) { - metadata+: { - namespace: namespace, - }, - } - for forwarder in std.objectFields(params.namespaceLogForwarder) - if !std.member(namespaceLogForwarderIgnoreKeys, forwarder) -]; - -// namespaceRoleBinding: -// Create RoleBinding for namespaced LogForwarder. 
-local namespaceRoleBinding = [ - local specs = params.namespaceLogForwarder[forwarder]; - local namespace = utils.namespacedName(forwarder).namespace; - local serviceAccount = std.get(specs, 'serviceAccountName', utils.namespacedName(forwarder).name); - - kube.RoleBinding(serviceAccount) { - metadata+: { - namespace: namespace, - }, - roleRef: { - apiGroup: 'rbac.authorization.k8s.io', - kind: 'ClusterRole', - name: 'collect-application-logs', - }, - subjects: [ { - kind: 'ServiceAccount', - name: serviceAccount, - namespace: namespace, - } ], - } - for forwarder in std.objectFields(params.namespaceLogForwarder) - if !std.member(namespaceLogForwarderIgnoreKeys, forwarder) -]; - -local enableLogForwarder = std.length(params.clusterLogForwarder) > 0 || std.get(legacyConfig, 'enabled', false); - -// Define outputs below -if enableLogForwarder then - { - '31_cluster_logforwarding': clusterLogForwarder, - [if std.length(params.namespaceLogForwarder) > 1 then '32_namespace_logforwarding']: namespaceLogForwarder, - [if std.length(params.namespaceLogForwarder) > 1 then '32_namespace_serviceaccount']: namespaceServiceAccount, - [if std.length(params.namespaceLogForwarder) > 1 then '32_namespace_rolebinding']: namespaceRoleBinding, - } -else - std.trace( - 'Log forwarding disabled, not deploying ClusterLogForwarder', - {} - ) diff --git a/component/config_logging.libsonnet b/component/config_logging.libsonnet deleted file mode 100644 index 1945fce..0000000 --- a/component/config_logging.libsonnet +++ /dev/null @@ -1,123 +0,0 @@ -local kap = import 'lib/kapitan.libjsonnet'; -local lib = import 'lib/openshift4-logging.libsonnet'; - -local inv = kap.inventory(); -local params = inv.parameters.openshift4_logging; - -local deployLokistack = params.components.lokistack.enabled; -local deployElasticsearch = params.components.elasticsearch.enabled; - -// Apply defaults for Lokistack. -local patchLokistackDefaults = { - [if deployLokistack then 'spec']: { - logStore: { - type: 'lokistack', - lokistack: { - name: 'loki', - }, - }, - }, -}; - -// Apply defaults for Elasticsearch. -local patchElasticsearchDefaults = { - [if deployElasticsearch then 'spec']: { - logStore: { - elasticsearch: { - nodeCount: 3, - storage: { - size: '200Gi', - }, - redundancyPolicy: 'SingleRedundancy', - nodeSelector: { - 'node-role.kubernetes.io/infra': '', - }, - }, - retentionPolicy: { - application: { - maxAge: '7d', - pruneNamespacesInterval: '15m', - }, - infra: { - maxAge: '30d', - pruneNamespacesInterval: '15m', - }, - audit: { - maxAge: '30d', - pruneNamespacesInterval: '15m', - }, - }, - }, - visualization: { - type: 'kibana', - kibana: { - replicas: 2, - nodeSelector: { - 'node-role.kubernetes.io/infra': '', - }, - }, - }, - }, -}; - -// Apply customisations from params.clusterLogging. -local patchLoggingConfig = { - spec: params.clusterLogging { - collection: { - // Don't include legacy config key 'collection.logs'. - [it]: params.clusterLogging.collection[it] - for it in std.objectFields(std.get(params.clusterLogging, 'collection', {})) - if it != 'logs' - }, - }, -}; - -// --- patch deprecated logging resource -local patchLegacyConfig = { - local legacyConfig = std.get(std.get(params.clusterLogging, 'collection', { collection: {} }), 'logs', {}), - local legacyType = std.get(legacyConfig, 'type', ''), - local legacyFluentd = std.get(legacyConfig, 'fluentd', {}), - - spec: { - collection: if std.length(legacyConfig) > 0 then std.trace( - 'Parameter `clusterLogging.collector.logs` is deprecated. 
Please update your config to use `clusterLogging.collector`', - { - [if legacyType != '' then 'type']: legacyType, - } + legacyFluentd, - ) else {}, - }, -}; -// --- patch end - - -// ClusterLogging specs: -// Consecutively apply patches to result of previous apply. -local clusterLogging = std.foldl( - // we use std.mergePatch here, because this way we don't need - // to make each patch object mergeable by suffixing all keys with a +. - function(manifest, patch) std.mergePatch(manifest, patch), - [ - patchLokistackDefaults, - patchElasticsearchDefaults, - patchLoggingConfig, - patchLegacyConfig, - ], - lib.ClusterLogging(params.namespace, 'instance') { - metadata+: { - annotations+: { - 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', - }, - }, - spec: { - managementState: 'Managed', - collection: { - type: 'vector', - }, - }, - } -); - -// Define outputs below -{ - '30_cluster_logging': clusterLogging, -} diff --git a/component/elasticsearch.libsonnet b/component/elasticsearch.libsonnet deleted file mode 100644 index 1e726b2..0000000 --- a/component/elasticsearch.libsonnet +++ /dev/null @@ -1,133 +0,0 @@ -// main template for openshift4-lokistack -local kap = import 'lib/kapitan.libjsonnet'; -local kube = import 'lib/kube.libjsonnet'; -local resourceLocker = import 'lib/resource-locker.libjsonnet'; - -// The hiera parameters for the component -local inv = kap.inventory(); -local params = inv.parameters.openshift4_logging; -local elasticsearch = inv.parameters.openshift4_logging.components.elasticsearch; - - -local machineconfig_journald = [ - kube._Object('machineconfiguration.openshift.io/v1', 'MachineConfig', '40-' + role + '-journald') { - metadata+: { - labels+: { - 'machineconfiguration.openshift.io/role': role, - }, - }, - spec: { - config: { - ignition: { - version: '2.2.0', - }, - storage: { - files: [ - { - contents: { - // See https://docs.openshift.com/container-platform/latest/logging/config/cluster-logging-systemd.html - source: 'data:text/plain;charset=utf-8;base64,' + std.base64(||| - MaxRetentionSec=1month - RateLimitBurst=10000 - RateLimitInterval=1s - Storage=persistent - SyncIntervalSec=1s - |||), - }, - filesystem: 'root', - mode: 420, - path: '/etc/systemd/journald.conf', - }, - ], - }, - }, - }, - } - for role in [ 'master', 'worker' ] -]; - -// Allow cluster-scoped ES operator to access ES pods in openshift-logging -local netpol_operator = kube.NetworkPolicy('allow-from-openshift-operators-redhat') { - spec: { - ingress: [ - { - from: [ - { - namespaceSelector: { - matchLabels: { - name: 'openshift-operators-redhat', - }, - }, - }, - { - podSelector: { - matchLabels: { - name: 'elasticsearch-operator', - }, - }, - }, - ], - }, - ], - podSelector: {}, - policyTypes: [ 'Ingress' ], - }, -}; - -// Keep config backwards compatible -local kibana_host = - if std.objectHas(params, 'kibana_host') then - std.trace( - 'parameter kibana_host is deprecated, please use parameter `components.elasticsearch.kibana_host instead`', - params.kibana_host - ) - else elasticsearch.kibana_host; - -local kibana_routeToPatch = kube._Object('route.openshift.io/v1', 'Route', 'kibana') { - metadata+: { - namespace: inv.parameters.openshift4_logging.namespace, - }, -}; - -local kibana_patch = resourceLocker.Patch(kibana_routeToPatch, { - spec: { - host: kibana_host, - }, -}); - -// OpenShift has custom RBAC permissions on routes if you want to set a host ┻━┻︵ヽ(`Д´)ノ︵ ┻━┻ -local kibana_patchWithAdditionalPermissions = std.map( - function(obj) - if obj.apiVersion == 
'rbac.authorization.k8s.io/v1' && obj.kind == 'Role' then - obj { - rules+: [ - { - apiGroups: [ - 'route.openshift.io', - ], - resources: [ - 'routes/custom-host', - ], - verbs: [ - '*', - ], - }, - ], - } - else - obj - , kibana_patch -); - -// Define outputs below -if elasticsearch.enabled then - { - '40_es_machineconfig': machineconfig_journald, - '40_es_netpol': netpol_operator, - [if kibana_host != null then '40_es_kibana_host']: kibana_patchWithAdditionalPermissions, - } -else - std.trace( - 'Elasticsearch disabled, not deploying Elasticsearch stack', - {} - ) diff --git a/component/extracted_alerts/master/collector_prometheus_alerts.yaml b/component/extracted_alerts/master/collector_prometheus_alerts.yaml index 1942d35..2d5cdf8 100644 --- a/component/extracted_alerts/master/collector_prometheus_alerts.yaml +++ b/component/extracted_alerts/master/collector_prometheus_alerts.yaml @@ -9,7 +9,7 @@ spec: rules: - alert: CollectorNodeDown annotations: - message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." + description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." summary: "Collector cannot be scraped" expr: | up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 @@ -17,70 +17,9 @@ spec: labels: service: collector severity: critical - - alert: CollectorHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - - alert: CollectorVeryHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - - alert: ElasticsearchDeprecation - annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead." - summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in 6.0 release" - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - service: storage - severity: Warning - namespace: openshift-logging - - alert: FluentdDeprecation - annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead." 
- summary: "Detected Fluentd as the collector, which has been removed in a 6.0 release" - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - service: collector - severity: Warning - namespace: openshift-logging - - alert: KibanaDeprecation - annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead." - summary: "Detected Kibana as the log data visualization, which has been removed in the 6.0 release" - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - service: visualization - severity: Warning - namespace: openshift-logging - alert: DiskBufferUsage annotations: - message: "Collectors potentially consuming too much node disk, {{ $value }}% " + description: "Collectors potentially consuming too much node disk, {{ $value }}% " summary: "Detected consuming too much node disk on $labels.hostname host" expr: | (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') diff --git a/component/extracted_alerts/master/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/master/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/master/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." - "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." - "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." 
- "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." 
- "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." - "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml index 15cc424..799c280 100644 --- a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml @@ -175,6 +175,24 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning - alert: LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/component/extracted_alerts/release-5.6/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/release-5.6/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." - "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." - "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." 
- "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." 
- "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." - "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." - "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-5.6/fluentd_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/fluentd_prometheus_alerts.yaml deleted file mode 100644 index 7772c47..0000000 --- a/component/extracted_alerts/release-5.6/fluentd_prometheus_alerts.yaml +++ /dev/null @@ -1,64 +0,0 @@ - -"groups": -- "name": "logging_fluentd.alerts" - "rules": - - "alert": "FluentdNodeDown" - "annotations": - "message": "Prometheus could not scrape fluentd {{ $labels.container }} for more than 10m." - "summary": "Fluentd cannot be scraped" - "expr": | - up{job = "collector", container = "collector"} == 0 or absent(up{job="collector", container="collector"}) == 1 - "for": "10m" - "labels": - "service": "collector" - "severity": "critical" - namespace: "openshift-logging" - - "alert": "FluentdQueueLengthIncreasing" - "annotations": - "message": "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - "summary": "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." - "expr": | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - "for": "1h" - "labels": - "service": "collector" - "severity": "Warning" - namespace: "openshift-logging" - - alert: FluentDHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. - summary: FluentD output errors are high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 10 - for: 15m - labels: - severity: warning - namespace: "openshift-logging" - - alert: FluentDVeryHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. 
- summary: FluentD output errors are very high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 25 - for: 15m - labels: - severity: critical - namespace: "openshift-logging" -- "name": "logging_clusterlogging_telemetry.rules" - "rules": - - "expr": | - sum by(cluster)(log_collected_bytes_total) - "record": "cluster:log_collected_bytes_total:sum" - - "expr": | - sum by(cluster)(log_logged_bytes_total) - "record": "cluster:log_logged_bytes_total:sum" diff --git a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml deleted file mode 100644 index f378c49..0000000 --- a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml +++ /dev/null @@ -1,177 +0,0 @@ ---- -groups: -- name: logging_loki.alerts - rules: - - alert: LokiRequestErrors - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - summary: "At least 10% of requests are responded by 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors" - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiStackWriteRequestErrors - annotations: - message: |- - {{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. - summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors" - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiStackReadRequestErrors - annotations: - message: |- - {{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. - summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors" - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: |- - {{ $labels.job }} is experiencing an increase of {{ $value }} panics. - summary: "A panic was triggered." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Panics" - expr: | - sum( - increase( - loki_panic_total[10m] - ) - ) by (job, namespace) - > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. 
- summary: "The 99th percentile is experiencing high latency (higher than 1 second)." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency" - expr: | - histogram_quantile(0.99, - sum( - irate( - loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m] - ) - ) by (job, le, namespace, route) - ) - > 1 - for: 15m - labels: - severity: critical - - alert: LokiTenantRateLimit - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing 429 errors. - summary: "At least 10% of requests are responded with the rate limit error code." - runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit" - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: warning - - alert: LokiStorageSlowWrite - annotations: - message: |- - The storage path is experiencing slow write response rates. - summary: "The storage path is experiencing slow write response rates." - runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write" - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"} - ) by (job, le, namespace) - ) - > 1 - for: 15m - labels: - severity: warning - - alert: LokiStorageSlowRead - annotations: - message: |- - The storage path is experiencing slow read response rates. - summary: "The storage path is experiencing slow read response rates." - runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read" - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"} - ) by (job, le, namespace) - ) - > 5 - for: 15m - labels: - severity: warning - - alert: LokiWritePathHighLoad - annotations: - message: |- - The write path is experiencing high load. - summary: "The write path is experiencing high load, causing backpressure storage flushing." - runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load" - expr: | - sum( - loki_ingester_wal_replay_flushing - ) by (job, namespace) - > 0 - for: 15m - labels: - severity: warning - - alert: LokiReadPathHighLoad - annotations: - message: |- - The read path is experiencing high load. - summary: "The read path has high volume of queries, causing longer response times." - runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load" - expr: | - histogram_quantile(0.99, - sum( - rate( - loki_logql_querystats_latency_seconds_bucket[5m] - ) - ) by (job, le, namespace) - ) - > 30 - for: 15m - labels: - severity: warning diff --git a/component/extracted_alerts/release-5.7/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.7/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/release-5.7/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." 
- "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." - "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." 
- "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." - "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." - "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-5.7/fluentd_prometheus_alerts.yaml b/component/extracted_alerts/release-5.7/fluentd_prometheus_alerts.yaml deleted file mode 100644 index 7772c47..0000000 --- a/component/extracted_alerts/release-5.7/fluentd_prometheus_alerts.yaml +++ /dev/null @@ -1,64 +0,0 @@ - -"groups": -- "name": "logging_fluentd.alerts" - "rules": - - "alert": "FluentdNodeDown" - "annotations": - "message": "Prometheus could not scrape fluentd {{ $labels.container }} for more than 10m." - "summary": "Fluentd cannot be scraped" - "expr": | - up{job = "collector", container = "collector"} == 0 or absent(up{job="collector", container="collector"}) == 1 - "for": "10m" - "labels": - "service": "collector" - "severity": "critical" - namespace: "openshift-logging" - - "alert": "FluentdQueueLengthIncreasing" - "annotations": - "message": "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - "summary": "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." 
- "expr": | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - "for": "1h" - "labels": - "service": "collector" - "severity": "Warning" - namespace: "openshift-logging" - - alert: FluentDHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. - summary: FluentD output errors are high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 10 - for: 15m - labels: - severity: warning - namespace: "openshift-logging" - - alert: FluentDVeryHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. - summary: FluentD output errors are very high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 25 - for: 15m - labels: - severity: critical - namespace: "openshift-logging" -- "name": "logging_clusterlogging_telemetry.rules" - "rules": - - "expr": | - sum by(cluster)(log_collected_bytes_total) - "record": "cluster:log_collected_bytes_total:sum" - - "expr": | - sum by(cluster)(log_logged_bytes_total) - "record": "cluster:log_logged_bytes_total:sum" diff --git a/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml deleted file mode 100644 index c4f1663..0000000 --- a/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml +++ /dev/null @@ -1,71 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: collector - namespace: openshift-logging -spec: - groups: - - name: logging_collector.alerts - rules: - - alert: CollectorNodeDown - annotations: - message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." - summary: "Collector cannot be scraped" - expr: | - up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 - for: 10m - labels: - service: collector - severity: critical - - alert: CollectorHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - - alert: CollectorVeryHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." 
- summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - - alert: FluentdQueueLengthIncreasing - annotations: - message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." - expr: | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - for: 1h - labels: - service: collector - severity: Warning - - name: logging_clusterlogging_telemetry.rules - rules: - - expr: | - sum by(cluster)(log_collected_bytes_total) - record: cluster:log_collected_bytes_total:sum - - expr: | - sum by(cluster)(log_logged_bytes_total) - record: cluster:log_logged_bytes_total:sum - - expr: | - sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m])) - record: collector:log_num_errors:sum_rate - - expr: | - sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m])) - record: collector:received_events:sum_rate diff --git a/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." - "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." - "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." 
- "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." 
- "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." - "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." - "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml deleted file mode 100644 index f378c49..0000000 --- a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml +++ /dev/null @@ -1,177 +0,0 @@ ---- -groups: -- name: logging_loki.alerts - rules: - - alert: LokiRequestErrors - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - summary: "At least 10% of requests are responded by 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors" - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiStackWriteRequestErrors - annotations: - message: |- - {{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. - summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors" - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiStackReadRequestErrors - annotations: - message: |- - {{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. - summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors." 
- runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors" - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: |- - {{ $labels.job }} is experiencing an increase of {{ $value }} panics. - summary: "A panic was triggered." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Panics" - expr: | - sum( - increase( - loki_panic_total[10m] - ) - ) by (job, namespace) - > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - summary: "The 99th percentile is experiencing high latency (higher than 1 second)." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency" - expr: | - histogram_quantile(0.99, - sum( - irate( - loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m] - ) - ) by (job, le, namespace, route) - ) - > 1 - for: 15m - labels: - severity: critical - - alert: LokiTenantRateLimit - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing 429 errors. - summary: "At least 10% of requests are responded with the rate limit error code." - runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit" - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: warning - - alert: LokiStorageSlowWrite - annotations: - message: |- - The storage path is experiencing slow write response rates. - summary: "The storage path is experiencing slow write response rates." - runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write" - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"} - ) by (job, le, namespace) - ) - > 1 - for: 15m - labels: - severity: warning - - alert: LokiStorageSlowRead - annotations: - message: |- - The storage path is experiencing slow read response rates. - summary: "The storage path is experiencing slow read response rates." - runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read" - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"} - ) by (job, le, namespace) - ) - > 5 - for: 15m - labels: - severity: warning - - alert: LokiWritePathHighLoad - annotations: - message: |- - The write path is experiencing high load. - summary: "The write path is experiencing high load, causing backpressure storage flushing." - runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load" - expr: | - sum( - loki_ingester_wal_replay_flushing - ) by (job, namespace) - > 0 - for: 15m - labels: - severity: warning - - alert: LokiReadPathHighLoad - annotations: - message: |- - The read path is experiencing high load. - summary: "The read path has high volume of queries, causing longer response times." 
- runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load" - expr: | - histogram_quantile(0.99, - sum( - rate( - loki_logql_querystats_latency_seconds_bucket[5m] - ) - ) by (job, le, namespace) - ) - > 30 - for: 15m - labels: - severity: warning diff --git a/component/extracted_alerts/release-5.9/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-5.9/collector_prometheus_alerts.yaml deleted file mode 100644 index 30ee172..0000000 --- a/component/extracted_alerts/release-5.9/collector_prometheus_alerts.yaml +++ /dev/null @@ -1,115 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: collector - namespace: openshift-logging -spec: - groups: - - name: logging_collector.alerts - rules: - - alert: CollectorNodeDown - annotations: - message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." - summary: "Collector cannot be scraped" - expr: | - up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 - for: 10m - labels: - service: collector - severity: critical - - alert: CollectorHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - - alert: CollectorVeryHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - - alert: FluentdQueueLengthIncreasing - annotations: - message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." - expr: | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - for: 1h - labels: - service: collector - severity: Warning - - alert: ElasticsearchDeprecation - annotations: - message: "The OpenShift Elasticsearch Operator is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to using the OpenShift Elasticsearch Operator to manage the default log storage, you can use the Loki Operator." - summary: "Detected Elasticsearch as the in-cluster storage which is deprecated and will be removed in a future release." 
- expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - service: storage - severity: Warning - namespace: openshift-logging - - alert: FluentdDeprecation - annotations: - message: "Fluentd is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to Fluentd, you can use Vector instead." - summary: "Detected Fluentd as the collector which is deprecated and will be removed in a future release." - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - service: collector - severity: Warning - namespace: openshift-logging - - alert: KibanaDeprecation - annotations: - message: "The Kibana web console is now deprecated and is planned to be removed in a future logging release." - summary: "Detected Kibana as the visualization which is deprecated and will be removed in a future release." - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - service: visualization - severity: Warning - namespace: openshift-logging - - alert: DiskBufferUsage - annotations: - message: "Collectors potentially consuming too much node disk, {{ $value }}% " - summary: "Detected consuming too much node disk on $labels.hostname host" - expr: | - (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') - / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15 - for: 5m - labels: - service: collector - severity: Warning - - name: logging_clusterlogging_telemetry.rules - rules: - - expr: | - sum by(cluster)(log_collected_bytes_total) - record: cluster:log_collected_bytes_total:sum - - expr: | - sum by(cluster)(log_logged_bytes_total) - record: cluster:log_logged_bytes_total:sum - - expr: | - sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m])) - record: collector:log_num_errors:sum_rate - - expr: | - sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m])) - record: collector:received_events:sum_rate diff --git a/component/extracted_alerts/release-5.9/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.9/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/release-5.9/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." 
- "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." - "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." 
- "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." - "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." - "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-6.0/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-6.0/collector_prometheus_alerts.yaml new file mode 100644 index 0000000..2d5cdf8 --- /dev/null +++ b/component/extracted_alerts/release-6.0/collector_prometheus_alerts.yaml @@ -0,0 +1,45 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: collector + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: CollectorNodeDown + annotations: + description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." 
+ summary: "Collector cannot be scraped" + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + - alert: DiskBufferUsage + annotations: + description: "Collectors potentially consuming too much node disk, {{ $value }}% " + summary: "Detected consuming too much node disk on $labels.hostname host" + expr: | + (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') + / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15 + for: 5m + labels: + service: collector + severity: Warning + - name: logging_clusterlogging_telemetry.rules + rules: + - expr: | + sum by(cluster)(log_logged_bytes_total) + record: cluster:log_logged_bytes_total:sum + - expr: | + sum by(pod, namespace, app_kubernetes_io_instance)(rate(vector_component_errors_total[2m])) + record: collector:log_num_errors:sum_rate + - expr: | + sum by(pod, namespace, app_kubernetes_io_instance)(rate(vector_component_received_events_total[2m])) + record: collector:received_events:sum_rate + + + + diff --git a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-6.0/lokistack_prometheus_alerts.yaml similarity index 89% rename from component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml rename to component/extracted_alerts/release-6.0/lokistack_prometheus_alerts.yaml index 15cc424..799c280 100644 --- a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/release-6.0/lokistack_prometheus_alerts.yaml @@ -175,6 +175,24 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning - alert: LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/component/extracted_alerts/release-6.1/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-6.1/collector_prometheus_alerts.yaml new file mode 100644 index 0000000..2d5cdf8 --- /dev/null +++ b/component/extracted_alerts/release-6.1/collector_prometheus_alerts.yaml @@ -0,0 +1,45 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: collector + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: CollectorNodeDown + annotations: + description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." 
+ summary: "Collector cannot be scraped" + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + - alert: DiskBufferUsage + annotations: + description: "Collectors potentially consuming too much node disk, {{ $value }}% " + summary: "Detected consuming too much node disk on $labels.hostname host" + expr: | + (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') + / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15 + for: 5m + labels: + service: collector + severity: Warning + - name: logging_clusterlogging_telemetry.rules + rules: + - expr: | + sum by(cluster)(log_logged_bytes_total) + record: cluster:log_logged_bytes_total:sum + - expr: | + sum by(pod, namespace, app_kubernetes_io_instance)(rate(vector_component_errors_total[2m])) + record: collector:log_num_errors:sum_rate + - expr: | + sum by(pod, namespace, app_kubernetes_io_instance)(rate(vector_component_received_events_total[2m])) + record: collector:received_events:sum_rate + + + + diff --git a/component/extracted_alerts/release-5.7/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-6.1/lokistack_prometheus_alerts.yaml similarity index 79% rename from component/extracted_alerts/release-5.7/lokistack_prometheus_alerts.yaml rename to component/extracted_alerts/release-6.1/lokistack_prometheus_alerts.yaml index f378c49..799c280 100644 --- a/component/extracted_alerts/release-5.7/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/release-6.1/lokistack_prometheus_alerts.yaml @@ -175,3 +175,37 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + - alert: LokistackSchemaUpgradesRequired + annotations: + message: |- + The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.stack_namespace }}" is using a storage schema + configuration that does not contain the latest schema version. It is recommended to update the schema + configuration to update the schema version to the latest version in the future. + summary: "One or more of the deployed LokiStacks contains an outdated storage schema configuration." 
+ runbook_url: "[[ .RunbookURL ]]#Lokistack-Schema-Upgrades-Required" + expr: | + sum ( + lokistack_status_condition{reason="StorageNeedsSchemaUpdate",status="true"} + ) by (stack_namespace, stack_name) + > 0 + for: 1m + labels: + severity: warning diff --git a/component/log_forwarder.libsonnet b/component/log_forwarder.libsonnet new file mode 100644 index 0000000..d9aa3a4 --- /dev/null +++ b/component/log_forwarder.libsonnet @@ -0,0 +1,168 @@ +local com = import 'lib/commodore.libjsonnet'; +local kap = import 'lib/kapitan.libjsonnet'; +local kube = import 'lib/kube.libjsonnet'; + +local inv = kap.inventory(); +local params = inv.parameters.openshift4_logging; +local lokiEnabled = params.components.lokistack.enabled; +local forwarderEnabled = lokiEnabled || std.length(params.clusterLogForwarder) > 0; + +// Make sure the default output is added to the pipelines `outputRefs`, +// if the logging stack is not disabled. +local pipelineOutputRefs(pipeline) = + local default = if lokiEnabled then [ 'default' ] else []; + std.get(pipeline, 'forwarders', []) + default; + +// clusterLogForwarderSpec: +// Consecutively apply patches to result of previous apply. +local clusterLogForwarderSpec = { + local appsPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'application-logs', {}), + local infraPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'infrastructure-logs', {}), + local auditPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'audit-logs', {}), + + managementState: 'Managed', + collector: { + resources: { + requests: { + cpu: '20m', + memory: '400M', + }, + }, + tolerations: [ { + key: 'storagenode', + operator: 'Exists', + } ], + }, + serviceAccount: { + name: 'logcollector', + }, + filters: {}, + inputs: {}, + outputs: {}, + pipelines: { + [if lokiEnabled || std.length(appsPipeline) > 0 then 'application-logs']: { + inputRefs: [ 'application' ], + outputRefs: pipelineOutputRefs(appsPipeline), + }, + [if lokiEnabled || std.length(infraPipeline) > 0 then 'infrastructure-logs']: { + inputRefs: [ 'infrastructure' ], + outputRefs: pipelineOutputRefs(infraPipeline), + }, + [if std.length(auditPipeline) > 0 then 'audit-logs']: { + inputRefs: [ 'audit' ], + }, + }, +} + com.makeMergeable(params.clusterLogForwarder); + +// Unfold objects into array for ClusterLogForwarder resource. +local unfoldSpecs(specs) = { + // Unfold objects into array. + [if std.length(specs.inputs) > 0 then 'inputs']: [ + { name: name } + specs.inputs[name] + for name in std.objectFields(specs.inputs) + ], + [if std.length(specs.outputs) > 0 then 'outputs']: [ + { name: name } + specs.outputs[name] + for name in std.objectFields(specs.outputs) + ], + [if std.length(specs.pipelines) > 0 then 'pipelines']: [ + { name: name } + specs.pipelines[name] + for name in std.objectFields(specs.pipelines) + ], +} + { + // Import remaining specs as is. + [key]: specs[key] + for key in std.objectFields(specs) + if !std.member([ 'filters', 'inputs', 'outputs', 'pipelines' ], key) +}; + +// ClusterLogForwarder: +// Create definitive ClusterLogForwarder resource from specs. 
+local clusterLogForwarder = kube._Object('observability.openshift.io/v1', 'ClusterLogForwarder', 'instance') { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', + }, + namespace: params.namespace, + }, + spec: unfoldSpecs(clusterLogForwarderSpec), +}; + +// Collector ServiceAccount +// Create a ServiceAccount and ClusterRoleBindings for collector pods. +local rbac = [ + kube.ServiceAccount('logcollector') { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-50', + }, + namespace: params.namespace, + }, + }, + kube._Object('rbac.authorization.k8s.io/v1', 'ClusterRoleBinding', 'logcollector-application-logs') { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-50', + }, + namespace: params.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'collect-application-logs', + }, + subjects: [ { + kind: 'ServiceAccount', + name: 'logcollector', + namespace: params.namespace, + } ], + }, + kube._Object('rbac.authorization.k8s.io/v1', 'ClusterRoleBinding', 'logcollector-infrastructure-logs') { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-50', + }, + namespace: params.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'collect-infrastructure-logs', + }, + subjects: [ { + kind: 'ServiceAccount', + name: 'logcollector', + namespace: params.namespace, + } ], + }, + kube._Object('rbac.authorization.k8s.io/v1', 'ClusterRoleBinding', 'logcollector-audit-logs') { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-50', + }, + namespace: params.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'collect-audit-logs', + }, + subjects: [ { + kind: 'ServiceAccount', + name: 'logcollector', + namespace: params.namespace, + } ], + }, +]; + +// Define outputs below +if forwarderEnabled then + { + '40_log_forwarder': clusterLogForwarder, + '40_log_forwarder_rbac': rbac, + } +else + std.trace( + 'Log forwarding disabled, not deploying ClusterLogForwarder', + {} + ) diff --git a/component/loki.libsonnet b/component/log_lokistack.libsonnet similarity index 85% rename from component/loki.libsonnet rename to component/log_lokistack.libsonnet index 09e406f..3b9da34 100644 --- a/component/loki.libsonnet +++ b/component/log_lokistack.libsonnet @@ -3,7 +3,6 @@ local com = import 'lib/commodore.libjsonnet'; local kap = import 'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; local po = import 'lib/patch-operator.libsonnet'; -local workaround = import 'loki_workaround.libsonnet'; // The hiera parameters for the component local inv = kap.inventory(); @@ -50,6 +49,7 @@ local lokistack_spec = { local lokistack = kube._Object('loki.grafana.com/v1', 'LokiStack', 'loki') { metadata+: { annotations+: { + 'argocd.argoproj.io/sync-wave': '-50', // Allow ArgoCD to do the dry run when the CRD doesn't exist yet 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', }, @@ -117,16 +117,34 @@ local aggregate_loki_log_access = kube.ClusterRole('syn:loki:cluster-reader') { ], }; +// Console Log Plugin +local console_plugin = kube._Object('observability.openshift.io/v1alpha1', 'UIPlugin', 'logging') { + metadata: { + labels: { + name: 'logging', + }, + name: 'logging', + }, + spec: { + type: 'Logging', + logging: { + lokiStack: { + name: 'loki', + }, + logsLimit: 50, + timeout: '30s', + }, + }, +}; + // Define outputs below if 
loki.enabled then { - '50_loki_stack': lokistack, - '50_loki_logstore': logstore, - '50_loki_netpol': [ netpol_viewplugin, netpol_lokigateway ], - '50_loki_rbac': [ aggregate_loki_log_access ], - '50_loki_operator_metrics_token': workaround.missing_metrics_token, - '50_loki_ingester_fix': workaround.ingester_stuck, - '50_loki_logreader_fix': workaround.app_logs_reader, + '30_loki_stack': lokistack, + '30_loki_logstore': logstore, + '30_loki_netpol': [ netpol_viewplugin, netpol_lokigateway ], + '30_loki_rbac': [ aggregate_loki_log_access ], + '30_loki_plugin': console_plugin, } else std.trace( diff --git a/component/logmetrics.libsonnet b/component/log_metricsexporter.libsonnet similarity index 85% rename from component/logmetrics.libsonnet rename to component/log_metricsexporter.libsonnet index 78db4f4..117eee6 100644 --- a/component/logmetrics.libsonnet +++ b/component/log_metricsexporter.libsonnet @@ -5,6 +5,11 @@ local inv = kap.inventory(); local logmetrics = inv.parameters.openshift4_logging.components.logmetrics; local logMetricExporter = kube._Object('logging.openshift.io/v1alpha1', 'LogFileMetricExporter', 'instance') { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-50', + }, + }, spec: logmetrics.spec, }; diff --git a/component/loki_workaround.libsonnet b/component/log_workaround.libsonnet similarity index 92% rename from component/loki_workaround.libsonnet rename to component/log_workaround.libsonnet index 2f2e0f0..ed9772a 100644 --- a/component/loki_workaround.libsonnet +++ b/component/log_workaround.libsonnet @@ -5,6 +5,7 @@ local kube = import 'lib/kube.libjsonnet'; // The hiera parameters for the component local inv = kap.inventory(); local params = inv.parameters.openshift4_logging; +local lokiEnabled = params.components.lokistack.enabled; // Generate missing metrics SA token for Loki Operator. 
@@ -146,8 +147,15 @@ local ingester_stuck = [ }, ]; -{ - missing_metrics_token: [ missing_metrics_token ], - ingester_stuck: ingester_stuck, - app_logs_reader: app_logs_reader, -} +// Define outputs below +if lokiEnabled then + { + '50_fix_missing_metrics_token': missing_metrics_token, + '50_fix_ingester_stuck': ingester_stuck, + '50_fix_app_logs_reader': app_logs_reader, + } +else + std.trace( + 'Lokistack disabled, not deploying Lokistack', + {} + ) diff --git a/component/main.jsonnet b/component/main.jsonnet index 84edfa1..f78d491 100644 --- a/component/main.jsonnet +++ b/component/main.jsonnet @@ -2,13 +2,10 @@ local com = import 'lib/commodore.libjsonnet'; local kap = import 'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; local operatorlib = import 'lib/openshift4-operators.libsonnet'; -local utils = import 'utils.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.openshift4_logging; - -local deployLokistack = params.components.lokistack.enabled; -local deployElasticsearch = params.components.elasticsearch.enabled; +local lokiEnabled = params.components.lokistack.enabled; // Namespace @@ -16,6 +13,7 @@ local namespace = kube.Namespace(params.namespace) { metadata+: { annotations+: { 'openshift.io/node-selector': '', + 'argocd.argoproj.io/sync-wave': '-100', }, labels+: { 'openshift.io/cluster-monitoring': 'true', @@ -27,13 +25,11 @@ local namespace = kube.Namespace(params.namespace) { local operatorGroup = operatorlib.OperatorGroup('cluster-logging') { metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-90', + }, namespace: params.namespace, }, - spec: { - [if !params.namespaceLogForwarderEnabled then 'targetNamespaces']: [ - params.namespace, - ], - }, }; // Subscriptions @@ -44,6 +40,11 @@ local logging = operatorlib.namespacedSubscription( params.channel, 'redhat-operators' ) { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-80', + }, + }, spec+: { config+: { resources: params.operatorResources.clusterLogging, @@ -51,11 +52,16 @@ local logging = operatorlib.namespacedSubscription( }, }; -local lokistack = if deployLokistack then operatorlib.managedSubscription( +local lokistack = if lokiEnabled then operatorlib.managedSubscription( 'openshift-operators-redhat', 'loki-operator', params.channel ) { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-80', + }, + }, spec+: { config+: { resources: params.operatorResources.lokistack, @@ -63,17 +69,14 @@ local lokistack = if deployLokistack then operatorlib.managedSubscription( }, }; -// With version 5.9 of the logging stack, elasticsearch is deprecated, -// this will clamp elasticsearch-operator subscription to stable-5.8. 
-local esChannel = if utils.isVersion59 then 'stable-5.8' else params.channel; -local elasticsearch = if deployElasticsearch then operatorlib.managedSubscription( +local observability = if lokiEnabled then operatorlib.managedSubscription( 'openshift-operators-redhat', - 'elasticsearch-operator', - esChannel + 'cluster-observability-operator', + 'development' ) { - spec+: { - config+: { - resources: params.operatorResources.elasticsearch, + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-80', }, }, }; @@ -81,7 +84,7 @@ local elasticsearch = if deployElasticsearch then operatorlib.managedSubscriptio local subscriptions = std.filter(function(it) it != null, [ logging, lokistack, - elasticsearch, + observability, ]); local secrets = com.generateResources(params.secrets, kube.Secret); @@ -93,9 +96,8 @@ local secrets = com.generateResources(params.secrets, kube.Secret); '20_subscriptions': subscriptions, [if std.length(params.secrets) > 0 then '99_secrets']: secrets, } -+ (import 'config_logging.libsonnet') -+ (import 'config_forwarding.libsonnet') -+ (import 'loki.libsonnet') -+ (import 'elasticsearch.libsonnet') ++ (import 'log_lokistack.libsonnet') ++ (import 'log_forwarder.libsonnet') ++ (import 'log_metricsexporter.libsonnet') ++ (import 'log_workaround.libsonnet') + (import 'alertrules.libsonnet') -+ (import 'logmetrics.libsonnet') diff --git a/component/utils.libsonnet b/component/utils.libsonnet deleted file mode 100644 index 2a6ccad..0000000 --- a/component/utils.libsonnet +++ /dev/null @@ -1,33 +0,0 @@ -local kap = import 'lib/kapitan.libjsonnet'; -local kube = import 'lib/kube.libjsonnet'; - -local inv = kap.inventory(); -local params = inv.parameters.openshift4_logging; - -local isVersion58 = - local major = std.split(params.version, '.')[0]; - local minor = std.split(params.version, '.')[1]; - if major == 'master' then true - else if std.parseInt(major) >= 6 then true - else if std.parseInt(major) == 5 && std.parseInt(minor) >= 8 then true - else false; - -local isVersion59 = - local major = std.split(params.version, '.')[0]; - local minor = std.split(params.version, '.')[1]; - if major == 'master' then true - else if std.parseInt(major) >= 6 then true - else if std.parseInt(major) == 5 && std.parseInt(minor) >= 9 then true - else false; - -local namespacedName(name) = { - local namespaced = std.splitLimit(name, '/', 1), - namespace: if std.length(namespaced) > 1 then namespaced[0] else params.namespace, - name: if std.length(namespaced) > 1 then namespaced[1] else namespaced[0], -}; - -{ - isVersion58: isVersion58, - isVersion59: isVersion59, - namespacedName: namespacedName, -} diff --git a/docs/modules/ROOT/pages/how-tos/upgrade-v4.x-v5.x.adoc b/docs/modules/ROOT/pages/how-tos/upgrade-v4.x-v5.x.adoc new file mode 100644 index 0000000..15006ad --- /dev/null +++ b/docs/modules/ROOT/pages/how-tos/upgrade-v4.x-v5.x.adoc @@ -0,0 +1,19 @@ += Upgrade from v4.x to v5.x + +The parameter `clusterLogForwarding` is deprecated. +Backward compatibility for `clusterLogForwarding` has been removed. + +Several parameters in `clusterLogForwarder` have changed. +See xref:references/parameters.adoc#_examples[Examples] and https://docs.openshift.com/container-platform/latest/observability/logging/logging-6.0/log6x-clf.html#structure-of-the-clusterlogforwarder[Structure of the ClusterLogForwarder] for reference. + +The parameter `clusterLogging` is deprecated and has been removed. +Migrate collector configuration to `clusterLogForwarder.collector`. 
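+
+A minimal sketch of this migration is shown below; it assumes the previous configuration only tuned the collector resources, and the key names under the removed `clusterLogging` parameter are shown for illustration only:
+
+[source,yaml]
+----
+# before (removed `clusterLogging` parameter; keys shown for illustration)
+clusterLogging:
+  collection:
+    resources:
+      requests:
+        cpu: 20m
+        memory: 400M
+
+# after (collector settings now live under `clusterLogForwarder.collector`)
+clusterLogForwarder:
+  collector:
+    resources:
+      requests:
+        cpu: 20m
+        memory: 400M
+----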
+ +The parameter `namespaceLogForwarderEnabled` is deprecated and has been removed. +Multi LogForwarder is enabled by default, see xref:how-tos/enable-multi-logforwarder.adoc[How-To] for migration if it was not enabled. + +The parameter `namespaceLogForwarder` is deprecated and has been removed. +If you are using Multi LogForwarder with ProjectSyn, migrate them to `adhoc-configuration`. + +The parameters `ignore_alerts` and `patch_alerts` are deprecated. +The component is backwards compatible, but moving the parameters to `alerts.ignore` and `alerts.patch` is highly encouraged. diff --git a/docs/modules/ROOT/pages/references/parameters.adoc b/docs/modules/ROOT/pages/references/parameters.adoc index 1d96b31..f1ffba8 100644 --- a/docs/modules/ROOT/pages/references/parameters.adoc +++ b/docs/modules/ROOT/pages/references/parameters.adoc @@ -38,17 +38,15 @@ default:: `stable-${openshift4_logging:version}` Channel of the operator subscription to use. If you specify the logging stack version through parameter `version`, you shouldn't need to modify this parameter. -In OpenShift 4.7, RedHat decoupled the logging stack version from the OpenShift version. -The decoupled logging stack versions start at version 5.0. -With version 5.1 of the logging stack, channels for specific minor versions were introduced. - Ideally we would just default to the `stable` channel, as that channel will always be backed by a logging stack version compatible with the OpenShift cluster version by the OpenShift marketplace operator. -However, since there's potential for changes in configuration between logging stack versions which need to be managed through the component, we default to using the `stable-5.x` channel matching the version specified in parameter `version`. +However, since there's potential for changes in configuration between logging stack versions which need to be managed through the component, we default to using the `stable-6.x` channel matching the version specified in parameter `version`. -See the https://docs.openshift.com/container-platform/latest/logging/cluster-logging-deploying.html#cluster-logging-deploy-cli_cluster-logging-deploying[OpenShift documentation] for details. +See the https://docs.openshift.com/container-platform/latest/observability/logging/logging-6.0/log6x-upgrading-to-6.html[OpenShift documentation] for details. == `alerts` +` +=== `release` [horizontal] type:: string @@ -70,7 +68,7 @@ Because of this, the component no longer automatically supports new versions of ==== -== `ignore_alerts` +=== `alerts.ignore` [horizontal] type:: list @@ -80,130 +78,13 @@ This parameter can be used to disable alerts provided by openshift cluster-loggi The component supports removing entries in this parameter by providing the entry prefixed with `~`. -== `patch_alerts` +=== `alerts.patch` [horizontal] type:: dictionary -default:: -+ -[source,yaml] ----- -patch_alerts: - FluentdQueueLengthIncreasing: - for: '12h' ----- - -The parameter patch_alerts allows users to customize upstream alerts. - - -== `components.elasticsearch` - -[horizontal] -type:: dictionary -default:: -+ -[source,yaml] ----- -components: - elasticsearch: - enabled: false - kibana_host: null - predict_elasticsearch_storage_alert: - enabled: true - lookback_range: 72h - predict_hours_from_now: 72 - threshold: 85 - for: 6h - severity: warning ----- - -Configuration of the elasticsearch component. - -IMPORTANT: Elasticsearch is deprecated. 
- -=== `components.elasticsearch.kibana_host` - -[horizontal] -type:: string -default:: `null` -example:: `kibana.apps.cluster.syn.tools` - -Host name of the Kibana route. - - -=== `components.elasticsearch.predict_elasticsearch_storage_alert` - -[horizontal] -type:: dict -example:: -+ -[source,yaml] ----- -components: - elasticsearch: - predict_elasticsearch_storage_alert: - enabled: true - lookback_range: 72h - predict_hours_from_now: 72 - threshold: 85 - for: 6h - severity: warning ----- - -Create an alert `SYN_ElasticsearchExpectNodeToReachDiskWatermark` if the storage allocated for Elasticsearch is predicted to reach the low storage watermark. - -==== `components.elasticsearch.predict_elasticsearch_storage_alert.enabled` - -[horizontal] -type:: boolean -default:: `true` - -Enable or disable this alert. - -==== `components.elasticsearch.predict_elasticsearch_storage_alert.lookback_range` - -[horizontal] -type:: prometheus duration -default:: `72h` - -How for to look back to calculate the prediction. - - -==== `components.elasticsearch.predict_elasticsearch_storage_alert.predict_hours_from_now` - -[horizontal] -type:: number -default:: `72` - -How far in the future the prediction is calculated. - - -==== `components.elasticsearch.predict_elasticsearch_storage_alert.threshold` - -[horizontal] -type:: number -default:: `85` - -The threshold for the alert. -Percentage of disk fill. - - -==== `components.elasticsearch.predict_elasticsearch_storage_alert.for` - -[horizontal] -type:: prometheus duration -default:: `6h` - -The alert is firing once the threshold has been reached for this long. - - -==== `components.elasticsearch.predict_elasticsearch_storage_alert.severity` - -[horizontal] -type:: string -default:: `warning` +default:: {} -The severity of the fired alert. +The parameter `alerts.patch` allows users to customize upstream alerts. == `components.lokistack` @@ -345,27 +226,7 @@ spec: <1> configure nodeSelector <2> configure resources -See the https://docs.openshift.com/container-platform/latest/observability/logging/log_collection_forwarding/cluster-logging-collector.html#creating-logfilesmetricexporter_cluster-logging-collector[LogCollection Docs] for available specs. - - -== `operatorResources` - -[horizontal] -type:: dictionary -default:: see `defaults.yml` - -A dictionary holding the `.spec.config.resources` for OLM subscriptions maintained by this component. - - -== `clusterLogging` - -[horizontal] -type:: dictionary -default:: {} - -A dictionary holding the `.spec` for cluster logging. - -See the https://docs.openshift.com/container-platform/latest/observability/logging/cluster-logging-deploying.html#create-cluster-logging-cli_cluster-logging-deploying[OpenShift docs] for available parameters. +Exporter to collect metrics about container logs being produced in a kubernetes environment It publishes `log_logged_bytes_total` metric in prometheus. == `clusterLogForwarder` @@ -376,32 +237,7 @@ default:: {} A dictionary holding the `.spec` for cluster log forwarding. -See the https://docs.openshift.com/container-platform/latest/observability/logging/log_collection_forwarding/log-forwarding.html[OpenShift docs] for available parameters. - - -== `clusterLogForwarding` - -IMPORTANT: `clusterLogForwarding` is deprecated, please use `clusterLogForwarder` - - -== `namespaceLogForwarderEnabled` - -[horizontal] -type:: bool -default:: false - -NOTE: Enabling namespaced log forwarding requires redeploying the logging operator. 
See xref:how-tos/enable-multi-forwarder.adoc[How-To] for instructions. - - -== `namespaceLogForwarder` - -[horizontal] -type:: dictionary -default:: {} - -A dictionary holding the `.spec` for namespaced log forwarding. - -See in examples below for configuration. +See the https://docs.openshift.com/container-platform/latest/observability/logging/logging-6.1/log6x-clf-6.1.html[OpenShift docs] for available parameters. == `secrets` @@ -415,41 +251,16 @@ The key is the name of the secret, the value is the content of the secret. The value must be a dict with a key `stringData` which is a dict of key/value pairs to add to the secret. -== Examples +== `operatorResources` -[source,yaml] ----- -clusterLogging: - logStore: - retentionPolicy: - application: - maxAge: 15d - elasticsearch: - nodeCount: 5 ----- +[horizontal] +type:: dictionary +default:: see `defaults.yml` -=== Use namespaced ClusterLogForwarder +A dictionary holding the `.spec.config.resources` for OLM subscriptions maintained by this component. -Example creates a `ClusterLogForwarder`, `ServiceAccount` and `RoleBinding` in namespace `my-namespace`. -[source,yaml] ----- -namespaceLogForwarderEnabled: true -namespaceLogForwarder: - my-namespace/my-forwarder: - outputs: - splunk-forwarder: - secret: - name: splunk-forwarder - type: fluentdForward - url: tls://splunk-forwarder:24224 - pipelines: - application-logs: - inputRefs: - - application - outputRefs: - - splunk-forwarder ----- +== Examples === Forward logs for all application logs to third-party @@ -458,14 +269,23 @@ namespaceLogForwarder: clusterLogForwarder: outputs: splunk-forwarder: - secret: - name: splunk-forwarder - type: fluentdForward - url: tls://splunk-forwarder:24224 + type: splunk + splunk: + authentication: + token: + key: hecToken + secretName: splunk-forwarder + url: https://splunk-server:8088 pipelines: application-logs: outputRefs: - splunk-forwarder + +secrets: + splunk-forwarder: + type: Opaque + stringData: + hecToken: 'super-secret-token' ---- === Forward logs for certain namespaces to third-party @@ -480,43 +300,23 @@ clusterLogForwarder: - my-namespace outputs: splunk-forwarder: - secret: - name: splunk-forwarder - type: fluentdForward - url: tls://splunk-forwarder:24224 + type: splunk + splunk: + authentication: + token: + key: hecToken + secretName: splunk-forwarder + url: https://splunk-server:8088 pipelines: my-apps: inputRefs: - my-apps outputRefs: - splunk-forwarder ----- - -=== Enable JSON parsing for all application logs - -[source,yaml] ----- -clusterLogForwarder: - pipelines: - application-logs: - parse: json ----- -=== Enable JSON parsing for certain namespaces - -[source,yaml] ----- -clusterLogForwarder: - inputs: - my-apps: - application: - namespaces: - - my-namespace - pipelines: - my-apps: - inputRefs: - - my-apps - outputRefs: - - default - parse: json +secrets: + splunk-forwarder: + type: Opaque + stringData: + hecToken: 'super-secret-token' ---- diff --git a/docs/modules/ROOT/pages/runbooks/SYN_ElasticsearchExpectNodeToReachDiskWatermark.adoc b/docs/modules/ROOT/pages/runbooks/SYN_ElasticsearchExpectNodeToReachDiskWatermark.adoc deleted file mode 100644 index 36e9424..0000000 --- a/docs/modules/ROOT/pages/runbooks/SYN_ElasticsearchExpectNodeToReachDiskWatermark.adoc +++ /dev/null @@ -1,14 +0,0 @@ -= Alert rule: SYN_ElasticsearchExpectNodeToReachDiskWatermark - -include::partial$runbooks/contribution_note.adoc[] - -== icon:glasses[] Overview - -This alert fires when the Elasticsearch node storage utilization is expected to reach the disk 
low watermark. -The default watermark is 85%. -The node will become read-only at the watermark. -To resolve this alert, unused data should be deleted or the https://kb.vshn.ch/oc4/how-tos/logging/increase-elasticsearch-storage-size.html[disk size must be increased]. - -== icon:bug[] Steps for debugging - -// Add detailed steps to debug and resolve the issue diff --git a/docs/modules/ROOT/partials/nav.adoc b/docs/modules/ROOT/partials/nav.adoc index 36f5603..8014d33 100644 --- a/docs/modules/ROOT/partials/nav.adoc +++ b/docs/modules/ROOT/partials/nav.adoc @@ -8,6 +8,7 @@ * xref:how-tos/upgrade-v1.x-v2.x.adoc[Upgrade from v1.x to v2.x] * xref:how-tos/upgrade-v2.x-v3.x.adoc[Upgrade from v2.x to v3.x] * xref:how-tos/upgrade-v3.x-v4.x.adoc[Upgrade from v3.x to v4.x] +* xref:how-tos/upgrade-v4.x-v5.x.adoc[Upgrade from v4.x to v5.x] * xref:how-tos/switch-to-lokistack.adoc[Switch to Lokistack] * xref:how-tos/enable-multi-forwarder.adoc[Enable Multi LogForwarder] diff --git a/lib/openshift4-logging.libsonnet b/lib/openshift4-logging.libsonnet deleted file mode 100644 index dd2a061..0000000 --- a/lib/openshift4-logging.libsonnet +++ /dev/null @@ -1,18 +0,0 @@ -local kube = import 'lib/kube.libjsonnet'; - -local ClusterLogging(namespace, name) = kube._Object('logging.openshift.io/v1', 'ClusterLogging', name) { - metadata+: { - namespace: namespace, - }, -}; - -local ClusterLogForwarder(namespace, name) = kube._Object('logging.openshift.io/v1', 'ClusterLogForwarder', name) { - metadata+: { - namespace: namespace, - }, -}; - -{ - ClusterLogging: ClusterLogging, - ClusterLogForwarder: ClusterLogForwarder, -} diff --git a/tests/defaults.yml b/tests/defaults.yml index fc9eccd..a8c82ec 100644 --- a/tests/defaults.yml +++ b/tests/defaults.yml @@ -16,7 +16,6 @@ parameters: input_paths: - tests/console-patch.jsonnet output_path: console-patching/ - openshift4_operators: defaultInstallPlanApproval: Automatic defaultSource: openshift-operators-redhat diff --git a/tests/elasticsearch.yml b/tests/elasticsearch.yml deleted file mode 100644 index 8532afe..0000000 --- a/tests/elasticsearch.yml +++ /dev/null @@ -1,35 +0,0 @@ -applications: - - openshift4-operators as openshift-operators-redhat - - openshift4-monitoring - -parameters: - kapitan: - dependencies: - - type: https - source: https://raw.githubusercontent.com/appuio/component-openshift4-operators/v1.0.2/lib/openshift4-operators.libsonnet - output_path: vendor/lib/openshift4-operators.libsonnet - - type: https - source: https://raw.githubusercontent.com/appuio/component-openshift4-monitoring/v2.9.0/lib/openshift4-monitoring-alert-patching.libsonnet - output_path: vendor/lib/alert-patching.libsonnet - compile: - - input_type: jsonnet - input_paths: - - tests/console-patch.jsonnet - output_path: console-patching/ - - openshift4_operators: - defaultInstallPlanApproval: Automatic - defaultSource: openshift-operators-redhat - defaultSourceNamespace: openshift-operators-redhat - - openshift4_logging: - components: - lokistack: - enabled: false - elasticsearch: - enabled: true - clusterLogging: - collection: - type: fluentd - logStore: - type: elasticsearch diff --git a/tests/forwardingonly.yml b/tests/forwardingonly.yml index cfcbe5d..1d72626 100644 --- a/tests/forwardingonly.yml +++ b/tests/forwardingonly.yml @@ -26,5 +26,3 @@ parameters: components: lokistack: enabled: false - elasticsearch: - enabled: false diff --git a/tests/golden/defaults/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/defaults/openshift4-logging/apps/openshift4-logging.yaml 
index e69de29..6825b97 100644 --- a/tests/golden/defaults/openshift4-logging/apps/openshift4-logging.yaml +++ b/tests/golden/defaults/openshift4-logging/apps/openshift4-logging.yaml @@ -0,0 +1,4 @@ +spec: + syncPolicy: + syncOptions: + - ServerSideApply=true diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/00_namespace.yaml index 1b27cf9..4c91e3c 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/00_namespace.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/00_namespace.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Namespace metadata: annotations: + argocd.argoproj.io/sync-wave: '-100' openshift.io/node-selector: '' labels: name: openshift-logging diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/10_operator_group.yaml index ff11675..52f645d 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/10_operator_group.yaml @@ -1,11 +1,9 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-90' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml index 1f0b7ad..1abaf03 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -1,13 +1,14 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.9 + channel: stable-6.1 config: resources: limits: @@ -23,13 +24,14 @@ spec: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: loki-operator name: loki-operator namespace: openshift-operators-redhat spec: - channel: stable-5.9 + channel: stable-6.1 config: resources: limits: @@ -41,3 +43,19 @@ spec: name: loki-operator source: openshift-operators-redhat sourceNamespace: openshift-operators-redhat +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-80' + labels: + name: cluster-observability-operator + name: cluster-observability-operator + namespace: openshift-operators-redhat +spec: + channel: development + installPlanApproval: Automatic + name: cluster-observability-operator + source: openshift-operators-redhat + sourceNamespace: openshift-operators-redhat diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 307f0ca..0000000 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: 
SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: vector - logStore: - lokistack: - name: loki - type: lokistack - managementState: Managed diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_logstore.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_logstore.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_logstore.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_logstore.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_netpol.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_netpol.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_netpol.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_netpol.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_plugin.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_plugin.yaml new file mode 100644 index 0000000..3128c2f --- /dev/null +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_plugin.yaml @@ -0,0 +1,13 @@ +apiVersion: observability.openshift.io/v1alpha1 +kind: UIPlugin +metadata: + labels: + name: logging + name: logging +spec: + logging: + logsLimit: 50 + lokiStack: + name: loki + timeout: 30s + type: Logging diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_rbac.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_rbac.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_rbac.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_rbac.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_stack.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_stack.yaml similarity index 97% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_stack.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_stack.yaml index 259068c..f859742 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_stack.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_stack.yaml @@ -3,6 +3,7 @@ kind: LokiStack metadata: annotations: argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + argocd.argoproj.io/sync-wave: '-50' labels: name: loki name: loki diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder.yaml new file mode 100644 index 0000000..944ed23 --- /dev/null +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder.yaml @@ -0,0 +1,32 @@ +apiVersion: observability.openshift.io/v1 +kind: ClusterLogForwarder +metadata: + annotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + labels: + name: instance + name: instance + namespace: openshift-logging +spec: + collector: + resources: + requests: + cpu: 20m + memory: 400M + tolerations: + - key: storagenode + operator: Exists + managementState: Managed + pipelines: + - inputRefs: + - application + name: application-logs + outputRefs: + - default + - inputRefs: + - infrastructure + name: 
infrastructure-logs + outputRefs: + - default + serviceAccount: + name: logcollector diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml new file mode 100644 index 0000000..cac68e1 --- /dev/null +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml @@ -0,0 +1,63 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-application-logs + name: logcollector-application-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-application-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-infrastructure-logs + name: logcollector-infrastructure-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-infrastructure-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-audit-logs + name: logcollector-audit-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-audit-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml index 268663f..4d23850 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ 
b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - annotations: {} labels: name: syn-collector-rules name: syn-collector-rules @@ -12,7 +11,7 @@ spec: rules: - alert: SYN_CollectorNodeDown annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m. summary: Collector cannot be scraped expr: | @@ -23,97 +22,10 @@ spec: severity: critical syn: 'true' syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: The OpenShift Elasticsearch Operator is deprecated and is planned - to be removed in a future release. Red Hat provides bug fixes and support - for this feature during the current release lifecycle, but this feature - no longer receives enhancements. As an alternative to using the OpenShift - Elasticsearch Operator to manage the default log storage, you can use - the Loki Operator. - summary: Detected Elasticsearch as the in-cluster storage which is deprecated - and will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: Fluentd is deprecated and is planned to be removed in a future - release. Red Hat provides bug fixes and support for this feature during - the current release lifecycle, but this feature no longer receives enhancements. - As an alternative to Fluentd, you can use Vector instead. - summary: Detected Fluentd as the collector which is deprecated and will - be removed in a future release. 
- expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: The Kibana web console is now deprecated and is planned to be - removed in a future logging release. - summary: Detected Kibana as the visualization which is deprecated and - will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..614581b 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - annotations: {} labels: name: syn-loki-logging-rules name: syn-loki-logging-rules @@ -204,6 +203,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. 
+ expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/golden/elasticsearch/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/elasticsearch/openshift4-logging/apps/openshift4-logging.yaml deleted file mode 100644 index e69de29..0000000 diff --git a/tests/golden/elasticsearch/openshift4-logging/console-patching/openshift4_console_params.yaml b/tests/golden/elasticsearch/openshift4-logging/console-patching/openshift4_console_params.yaml deleted file mode 100644 index e69de29..0000000 diff --git a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/00_namespace.yaml deleted file mode 100644 index 1b27cf9..0000000 --- a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/00_namespace.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - annotations: - openshift.io/node-selector: '' - labels: - name: openshift-logging - openshift.io/cluster-monitoring: 'true' - name: openshift-logging diff --git a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/10_operator_group.yaml deleted file mode 100644 index ff11675..0000000 --- a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: operators.coreos.com/v1 -kind: OperatorGroup -metadata: - annotations: {} - labels: - name: cluster-logging - name: cluster-logging - namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/20_subscriptions.yaml deleted file mode 100644 index f78b501..0000000 --- a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - annotations: {} - labels: - name: cluster-logging - name: cluster-logging - namespace: openshift-logging -spec: - channel: stable-5.9 - config: - resources: - limits: - memory: 256Mi - requests: - cpu: 10m - memory: 128Mi - installPlanApproval: Automatic - name: cluster-logging - source: redhat-operators - sourceNamespace: openshift-operators-redhat ---- -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - annotations: {} - labels: - name: elasticsearch-operator - name: elasticsearch-operator - namespace: openshift-operators-redhat -spec: - channel: stable-5.8 - config: - resources: - limits: - memory: 1.5Gi - requests: - cpu: 100m - memory: 1Gi - installPlanApproval: Automatic - name: elasticsearch-operator - source: openshift-operators-redhat - sourceNamespace: openshift-operators-redhat diff --git a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 6481164..0000000 --- a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ 
/dev/null @@ -1,38 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: fluentd - logStore: - elasticsearch: - nodeCount: 3 - nodeSelector: - node-role.kubernetes.io/infra: '' - redundancyPolicy: SingleRedundancy - storage: - size: 200Gi - retentionPolicy: - application: - maxAge: 7d - pruneNamespacesInterval: 15m - audit: - maxAge: 30d - pruneNamespacesInterval: 15m - infra: - maxAge: 30d - pruneNamespacesInterval: 15m - type: elasticsearch - managementState: Managed - visualization: - kibana: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - type: kibana diff --git a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/40_es_machineconfig.yaml b/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/40_es_machineconfig.yaml deleted file mode 100644 index fa0c76d..0000000 --- a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/40_es_machineconfig.yaml +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: machineconfiguration.openshift.io/v1 -kind: MachineConfig -metadata: - annotations: {} - labels: - machineconfiguration.openshift.io/role: master - name: 40-master-journald - name: 40-master-journald -spec: - config: - ignition: - version: 2.2.0 - storage: - files: - - contents: - source: data:text/plain;charset=utf-8;base64,TWF4UmV0ZW50aW9uU2VjPTFtb250aApSYXRlTGltaXRCdXJzdD0xMDAwMApSYXRlTGltaXRJbnRlcnZhbD0xcwpTdG9yYWdlPXBlcnNpc3RlbnQKU3luY0ludGVydmFsU2VjPTFzCg== - filesystem: root - mode: 420 - path: /etc/systemd/journald.conf ---- -apiVersion: machineconfiguration.openshift.io/v1 -kind: MachineConfig -metadata: - annotations: {} - labels: - machineconfiguration.openshift.io/role: worker - name: 40-worker-journald - name: 40-worker-journald -spec: - config: - ignition: - version: 2.2.0 - storage: - files: - - contents: - source: data:text/plain;charset=utf-8;base64,TWF4UmV0ZW50aW9uU2VjPTFtb250aApSYXRlTGltaXRCdXJzdD0xMDAwMApSYXRlTGltaXRJbnRlcnZhbD0xcwpTdG9yYWdlPXBlcnNpc3RlbnQKU3luY0ludGVydmFsU2VjPTFzCg== - filesystem: root - mode: 420 - path: /etc/systemd/journald.conf diff --git a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/40_es_netpol.yaml b/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/40_es_netpol.yaml deleted file mode 100644 index f96b35f..0000000 --- a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/40_es_netpol.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - annotations: {} - labels: - name: allow-from-openshift-operators-redhat - name: allow-from-openshift-operators-redhat -spec: - ingress: - - from: - - namespaceSelector: - matchLabels: - name: openshift-operators-redhat - - podSelector: - matchLabels: - name: elasticsearch-operator - podSelector: {} - policyTypes: - - Ingress diff --git a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/60_collector_alerts.yaml deleted file mode 100644 index 268663f..0000000 --- a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ /dev/null @@ -1,127 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - annotations: {} - labels: - name: syn-collector-rules - name: syn-collector-rules - namespace: 
openshift-logging -spec: - groups: - - name: logging_collector.alerts - rules: - - alert: SYN_CollectorNodeDown - annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod - }} collector component for more than 10m. - summary: Collector cannot be scraped - expr: | - up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 - for: 10m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: The OpenShift Elasticsearch Operator is deprecated and is planned - to be removed in a future release. Red Hat provides bug fixes and support - for this feature during the current release lifecycle, but this feature - no longer receives enhancements. As an alternative to using the OpenShift - Elasticsearch Operator to manage the default log storage, you can use - the Loki Operator. - summary: Detected Elasticsearch as the in-cluster storage which is deprecated - and will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: Fluentd is deprecated and is planned to be removed in a future - release. Red Hat provides bug fixes and support for this feature during - the current release lifecycle, but this feature no longer receives enhancements. - As an alternative to Fluentd, you can use Vector instead. - summary: Detected Fluentd as the collector which is deprecated and will - be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: The Kibana web console is now deprecated and is planned to be - removed in a future logging release. - summary: Detected Kibana as the visualization which is deprecated and - will be removed in a future release. 
- expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_DiskBufferUsage - annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' - summary: Detected consuming too much node disk on $labels.hostname host - expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ - \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ - \ group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'}))\ - \ * 100 > 15\n" - for: 5m - labels: - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging diff --git a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml b/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml deleted file mode 100644 index ab921b9..0000000 --- a/tests/golden/elasticsearch/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml +++ /dev/null @@ -1,204 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - annotations: {} - labels: - name: syn-elasticsearch-logging-rules - name: syn-elasticsearch-logging-rules - namespace: openshift-logging -spec: - groups: - - name: logging_elasticsearch.alerts - rules: - - alert: SYN_ElasticsearchClusterNotHealthy - annotations: - message: Cluster {{ $labels.cluster }} health status has been RED for - at least 7m. Cluster does not accept writes, shards may be missing or - master node hasn't been elected yet. - runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Cluster-Health-is-Red - summary: Cluster health status is RED - expr: | - sum by (cluster) (es_cluster_status == 2) - for: 7m - labels: - namespace: openshift-logging - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchClusterNotHealthy - annotations: - message: Cluster {{ $labels.cluster }} health status has been YELLOW for - at least 20m. Some shard replicas are not allocated. - runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Cluster-Health-is-Yellow - summary: Cluster health status is YELLOW - expr: | - sum by (cluster) (es_cluster_status == 1) - for: 20m - labels: - namespace: openshift-logging - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchWriteRequestsRejectionJumps - annotations: - message: High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster - }} cluster. This node may not be keeping up with the indexing speed. - runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Write-Requests-Rejection-Jumps - summary: High Write Rejection Ratio - {{ $value }}% - expr: | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - for: 10m - labels: - namespace: openshift-logging - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchNodeDiskWatermarkReached - annotations: - message: Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards - will be re-allocated to different nodes if possible. Make sure more - disk space is added to the node or drop old indices allocated to this - node. 
- runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Node-Disk-High-Watermark-Reached - summary: Disk High Watermark Reached - disk saturation is {{ $value }}% - expr: | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - for: 5m - labels: - namespace: openshift-logging - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchNodeDiskWatermarkReached - annotations: - message: Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every - index having a shard allocated on this node is enforced a read-only - block. The index block must be released manually when the disk utilization - falls below the high watermark. - runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Node-Disk-Flood-Watermark-Reached - summary: Disk Flood Stage Watermark Reached - disk saturation is {{ $value - }}% - expr: | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - for: 5m - labels: - namespace: openshift-logging - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDiskSpaceRunningLow - annotations: - message: Cluster {{ $labels.cluster }} is predicted to be out of disk - space within the next 6h. - runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Disk-Space-is-Running-Low - summary: Cluster low on disk space - expr: | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - for: 1h - labels: - namespace: openshift-logging - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchNodeDiskWatermarkReached - annotations: - message: Disk Low Watermark is predicted to be reached within the next - 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node - anymore. You should consider adding more disk to the node. - runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Node-Disk-Low-Watermark-Reached - summary: Disk Low Watermark is predicted to be reached within next 6h. - expr: | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - for: 1h - labels: - namespace: openshift-logging - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchNodeDiskWatermarkReached - annotations: - message: Disk High Watermark is predicted to be reached within the next - 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different - nodes if possible. Make sure more disk space is added to the node or - drop old indices allocated to this node. - runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Node-Disk-High-Watermark-Reached - summary: Disk High Watermark is predicted to be reached within next 6h. 
- expr: | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - for: 1h - labels: - namespace: openshift-logging - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchNodeDiskWatermarkReached - annotations: - message: Disk Flood Stage Watermark is predicted to be reached within - the next 6h at {{ $labels.pod }}. Every index having a shard allocated - on this node is enforced a read-only block. The index block must be - released manually when the disk utilization falls below the high watermark. - runbook_url: https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md#Elasticsearch-Node-Disk-Flood-Watermark-Reached - summary: Disk Flood Stage Watermark is predicted to be reached within - next 6h. - expr: | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - for: 1h - labels: - namespace: openshift-logging - severity: warning - syn: 'true' - syn_component: openshift4-logging - - name: elasticsearch_node_storage.alerts - rules: - - alert: SYN_ElasticsearchExpectNodeToReachDiskWatermark - annotations: - message: Expecting to reach disk low watermark at {{ $labels.node }} node - in {{ $labels.cluster }} cluster in 72 hours. When reaching the watermark - no new shards will be allocated to this node anymore. You should consider - adding more disk to the node. - runbook_url: https://hub.syn.tools/openshift4-logging/runbooks/SYN_ElasticsearchExpectNodeToReachDiskWatermark.html - summary: Expecting to Reach Disk Low Watermark in 72 Hours - expr: | - sum by(cluster, instance, node) ( - (1 - (predict_linear(es_fs_path_available_bytes[72h], 259200) / es_fs_path_total_bytes)) * 100 - ) > 85 - for: 6h - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging diff --git a/tests/golden/forwardingonly/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/forwardingonly/openshift4-logging/apps/openshift4-logging.yaml index e69de29..6825b97 100644 --- a/tests/golden/forwardingonly/openshift4-logging/apps/openshift4-logging.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/apps/openshift4-logging.yaml @@ -0,0 +1,4 @@ +spec: + syncPolicy: + syncOptions: + - ServerSideApply=true diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/00_namespace.yaml index 1b27cf9..4c91e3c 100644 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/00_namespace.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/00_namespace.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Namespace metadata: annotations: + argocd.argoproj.io/sync-wave: '-100' openshift.io/node-selector: '' labels: name: openshift-logging diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/10_operator_group.yaml index ff11675..52f645d 100644 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ 
b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/10_operator_group.yaml @@ -1,11 +1,9 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-90' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/20_subscriptions.yaml index 2c47bfe..9e42a78 100644 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -1,13 +1,14 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.9 + channel: stable-6.1 config: resources: limits: diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 5b5a28d..0000000 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: vector - managementState: Managed diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/60_collector_alerts.yaml index 268663f..4d23850 100644 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - annotations: {} labels: name: syn-collector-rules name: syn-collector-rules @@ -12,7 +11,7 @@ spec: rules: - alert: SYN_CollectorNodeDown annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m. summary: Collector cannot be scraped expr: | @@ -23,97 +22,10 @@ spec: severity: critical syn: 'true' syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' 
- summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: The OpenShift Elasticsearch Operator is deprecated and is planned - to be removed in a future release. Red Hat provides bug fixes and support - for this feature during the current release lifecycle, but this feature - no longer receives enhancements. As an alternative to using the OpenShift - Elasticsearch Operator to manage the default log storage, you can use - the Loki Operator. - summary: Detected Elasticsearch as the in-cluster storage which is deprecated - and will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: Fluentd is deprecated and is planned to be removed in a future - release. Red Hat provides bug fixes and support for this feature during - the current release lifecycle, but this feature no longer receives enhancements. - As an alternative to Fluentd, you can use Vector instead. - summary: Detected Fluentd as the collector which is deprecated and will - be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: The Kibana web console is now deprecated and is planned to be - removed in a future logging release. - summary: Detected Kibana as the visualization which is deprecated and - will be removed in a future release. 
- expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/legacy/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/legacy/openshift4-logging/apps/openshift4-logging.yaml deleted file mode 100644 index e69de29..0000000 diff --git a/tests/golden/legacy/openshift4-logging/console-patching/openshift4_console_params.yaml b/tests/golden/legacy/openshift4-logging/console-patching/openshift4_console_params.yaml deleted file mode 100644 index f71555a..0000000 --- a/tests/golden/legacy/openshift4-logging/console-patching/openshift4_console_params.yaml +++ /dev/null @@ -1,3 +0,0 @@ -config: - plugins: - - logging-view-plugin diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/00_namespace.yaml deleted file mode 100644 index 1b27cf9..0000000 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/00_namespace.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - annotations: - openshift.io/node-selector: '' - labels: - name: openshift-logging - openshift.io/cluster-monitoring: 'true' - name: openshift-logging diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/10_operator_group.yaml deleted file mode 100644 index ff11675..0000000 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: operators.coreos.com/v1 -kind: OperatorGroup -metadata: - annotations: {} - labels: - name: cluster-logging - name: cluster-logging - namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/20_subscriptions.yaml deleted file mode 100644 index 1f0b7ad..0000000 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - annotations: {} - labels: - name: cluster-logging - name: cluster-logging - namespace: openshift-logging -spec: - channel: stable-5.9 - config: - resources: - limits: - memory: 256Mi - requests: - cpu: 10m - memory: 128Mi - installPlanApproval: Automatic - name: cluster-logging - source: redhat-operators - sourceNamespace: openshift-operators-redhat ---- -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - annotations: {} - labels: - name: loki-operator - name: loki-operator - namespace: openshift-operators-redhat -spec: - channel: stable-5.9 - config: - resources: - limits: - memory: 512Mi - requests: - cpu: 50m - memory: 381Mi - installPlanApproval: Automatic - name: loki-operator - source: openshift-operators-redhat - 
sourceNamespace: openshift-operators-redhat diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 307f0ca..0000000 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: vector - logStore: - lokistack: - name: loki - type: lokistack - managementState: Managed diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml deleted file mode 100644 index 69c1e22..0000000 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogForwarder -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - inputs: - - application: - namespaces: - - app-one - - app-two - name: my-apps - outputs: - - name: custom-forwarder - type: syslog - - elasticsearch: - version: 8 - name: my-other-forwarder - type: elasticsearch - pipelines: - - inputRefs: - - application - name: application-logs - outputRefs: - - my-other-forwarder - - default - - my-forwarder - parse: json - - detectMultilineErrors: true - inputRefs: - - infrastructure - name: infrastructure-logs - outputRefs: - - default - parse: json - - inputRefs: - - my-apps - name: my-apps - outputRefs: - - custom-forwarder - parse: json diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_stack.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_stack.yaml deleted file mode 100644 index 259068c..0000000 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_stack.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: loki.grafana.com/v1 -kind: LokiStack -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: loki - name: loki -spec: - limits: - global: - ingestion: - ingestionBurstSize: 9 - ingestionRate: 5 - size: 1x.demo - storage: - schemas: - - effectiveDate: '2022-06-01' - version: v12 - - effectiveDate: '2024-09-01' - version: v13 - secret: - name: loki-logstore - type: s3 - storageClassName: '' - template: - compactor: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 1 - distributor: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - gateway: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - indexGateway: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - ingester: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - querier: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - queryFrontend: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - ruler: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 1 - tenants: - mode: openshift-logging diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_collector_alerts.yaml 
b/tests/golden/legacy/openshift4-logging/openshift4-logging/60_collector_alerts.yaml deleted file mode 100644 index 268663f..0000000 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ /dev/null @@ -1,127 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - annotations: {} - labels: - name: syn-collector-rules - name: syn-collector-rules - namespace: openshift-logging -spec: - groups: - - name: logging_collector.alerts - rules: - - alert: SYN_CollectorNodeDown - annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod - }} collector component for more than 10m. - summary: Collector cannot be scraped - expr: | - up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 - for: 10m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: The OpenShift Elasticsearch Operator is deprecated and is planned - to be removed in a future release. Red Hat provides bug fixes and support - for this feature during the current release lifecycle, but this feature - no longer receives enhancements. As an alternative to using the OpenShift - Elasticsearch Operator to manage the default log storage, you can use - the Loki Operator. - summary: Detected Elasticsearch as the in-cluster storage which is deprecated - and will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: Fluentd is deprecated and is planned to be removed in a future - release. Red Hat provides bug fixes and support for this feature during - the current release lifecycle, but this feature no longer receives enhancements. - As an alternative to Fluentd, you can use Vector instead. - summary: Detected Fluentd as the collector which is deprecated and will - be removed in a future release. 
- expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: The Kibana web console is now deprecated and is planned to be - removed in a future logging release. - summary: Detected Kibana as the visualization which is deprecated and - will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_DiskBufferUsage - annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' - summary: Detected consuming too much node disk on $labels.hostname host - expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ - \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ - \ group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'}))\ - \ * 100 > 15\n" - for: 5m - labels: - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml deleted file mode 100644 index 65a573e..0000000 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ /dev/null @@ -1,225 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - annotations: {} - labels: - name: syn-loki-logging-rules - name: syn-loki-logging-rules - namespace: openshift-logging -spec: - groups: - - name: logging_loki.alerts - rules: - - alert: SYN_LokiRequestErrors - annotations: - message: '{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf - "%.2f" $value }}% errors.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Request-Errors - summary: At least 10% of requests are responded by 5xx server errors. - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiStackWriteRequestErrors - annotations: - message: '{{ printf "%.2f" $value }}% of write requests from {{ $labels.job - }} in {{ $labels.namespace }} are returned with server errors.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#LokiStack-Write-Request-Errors - summary: At least 10% of write requests to the lokistack-gateway are responded - with 5xx server errors. 
- expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiStackReadRequestErrors - annotations: - message: '{{ printf "%.2f" $value }}% of query requests from {{ $labels.job - }} in {{ $labels.namespace }} are returned with server errors.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#LokiStack-Read-Request-Errors - summary: At least 10% of query requests to the lokistack-gateway are responded - with 5xx server errors. - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiRequestPanics - annotations: - message: '{{ $labels.job }} is experiencing an increase of {{ $value }} - panics.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Request-Panics - summary: A panic was triggered. - expr: | - sum( - increase( - loki_panic_total[10m] - ) - ) by (job, namespace) - > 0 - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiRequestLatency - annotations: - message: '{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf - "%.2f" $value }}s 99th percentile latency.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Request-Latency - summary: The 99th percentile is experiencing high latency (higher than - 1 second). - expr: | - histogram_quantile(0.99, - sum( - irate( - loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m] - ) - ) by (job, le, namespace, route) - ) - > 1 - for: 15m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiTenantRateLimit - annotations: - message: '{{ $labels.job }} {{ $labels.route }} is experiencing 429 errors.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Tenant-Rate-Limit - summary: At least 10% of requests are responded with the rate limit error - code. - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiStorageSlowWrite - annotations: - message: The storage path is experiencing slow write response rates. - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Storage-Slow-Write - summary: The storage path is experiencing slow write response rates. 
- expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"} - ) by (job, le, namespace) - ) - > 1 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiStorageSlowRead - annotations: - message: The storage path is experiencing slow read response rates. - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Storage-Slow-Read - summary: The storage path is experiencing slow read response rates. - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"} - ) by (job, le, namespace) - ) - > 5 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiWritePathHighLoad - annotations: - message: The write path is experiencing high load. - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Write-Path-High-Load - summary: The write path is experiencing high load, causing backpressure - storage flushing. - expr: | - sum( - loki_ingester_wal_replay_flushing - ) by (job, namespace) - > 0 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiReadPathHighLoad - annotations: - message: The read path is experiencing high load. - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Read-Path-High-Load - summary: The read path has high volume of queries, causing longer response - times. - expr: | - histogram_quantile(0.99, - sum( - rate( - loki_logql_querystats_latency_seconds_bucket[5m] - ) - ) by (job, le, namespace) - ) - > 30 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokistackSchemaUpgradesRequired - annotations: - message: |- - The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.stack_namespace }}" is using a storage schema - configuration that does not contain the latest schema version. It is recommended to update the schema - configuration to update the schema version to the latest version in the future. - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Lokistack-Schema-Upgrades-Required - summary: One or more of the deployed LokiStacks contains an outdated storage - schema configuration. 
- expr: | - sum ( - lokistack_status_condition{reason="StorageNeedsSchemaUpdate",status="true"} - ) by (stack_namespace, stack_name) - > 0 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging diff --git a/tests/golden/lokistack/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/lokistack/openshift4-logging/apps/openshift4-logging.yaml deleted file mode 100644 index e69de29..0000000 diff --git a/tests/golden/lokistack/openshift4-logging/console-patching/openshift4_console_params.yaml b/tests/golden/lokistack/openshift4-logging/console-patching/openshift4_console_params.yaml deleted file mode 100644 index f71555a..0000000 --- a/tests/golden/lokistack/openshift4-logging/console-patching/openshift4_console_params.yaml +++ /dev/null @@ -1,3 +0,0 @@ -config: - plugins: - - logging-view-plugin diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/00_namespace.yaml deleted file mode 100644 index 1b27cf9..0000000 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/00_namespace.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - annotations: - openshift.io/node-selector: '' - labels: - name: openshift-logging - openshift.io/cluster-monitoring: 'true' - name: openshift-logging diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/10_operator_group.yaml deleted file mode 100644 index ff11675..0000000 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: operators.coreos.com/v1 -kind: OperatorGroup -metadata: - annotations: {} - labels: - name: cluster-logging - name: cluster-logging - namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/20_subscriptions.yaml deleted file mode 100644 index 2c1caee..0000000 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - annotations: {} - labels: - name: cluster-logging - name: cluster-logging - namespace: openshift-logging -spec: - channel: stable-5.8 - config: - resources: - limits: - memory: 256Mi - requests: - cpu: 10m - memory: 128Mi - installPlanApproval: Automatic - name: cluster-logging - source: redhat-operators - sourceNamespace: openshift-operators-redhat ---- -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - annotations: {} - labels: - name: loki-operator - name: loki-operator - namespace: openshift-operators-redhat -spec: - channel: stable-5.8 - config: - resources: - limits: - memory: 512Mi - requests: - cpu: 50m - memory: 381Mi - installPlanApproval: Automatic - name: loki-operator - source: openshift-operators-redhat - sourceNamespace: openshift-operators-redhat diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 2ef6ec2..0000000 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: 
logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: fluentd - logStore: - lokistack: - name: loki - retentionPolicy: - application: - maxAge: 7d - pruneNamespacesInterval: 15m - audit: - maxAge: 30d - pruneNamespacesInterval: 15m - infra: - maxAge: 30d - pruneNamespacesInterval: 15m - type: lokistack - managementState: Managed diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_stack.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_stack.yaml deleted file mode 100644 index d61e352..0000000 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_stack.yaml +++ /dev/null @@ -1,58 +0,0 @@ -apiVersion: loki.grafana.com/v1 -kind: LokiStack -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: loki - name: loki -spec: - limits: - global: - ingestion: - ingestionBurstSize: 9 - ingestionRate: 5 - size: 1x.demo - storage: - schemas: - - effectiveDate: '2022-06-01' - version: v12 - secret: - name: loki-logstore - type: s3 - storageClassName: '' - template: - compactor: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 1 - distributor: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - gateway: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - indexGateway: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - ingester: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - querier: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - queryFrontend: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 2 - ruler: - nodeSelector: - node-role.kubernetes.io/infra: '' - replicas: 1 - tenants: - mode: openshift-logging diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_collector_alerts.yaml deleted file mode 100644 index 11efa15..0000000 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ /dev/null @@ -1,61 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - annotations: {} - labels: - name: syn-collector-rules - name: syn-collector-rules - namespace: openshift-logging -spec: - groups: - - name: logging_collector.alerts - rules: - - alert: SYN_CollectorNodeDown - annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod - }} collector component for more than 10m. - summary: Collector cannot be scraped - expr: | - up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 - for: 10m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' 
- summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml deleted file mode 100644 index f56e623..0000000 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ /dev/null @@ -1,206 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - annotations: {} - labels: - name: syn-loki-logging-rules - name: syn-loki-logging-rules - namespace: openshift-logging -spec: - groups: - - name: logging_loki.alerts - rules: - - alert: SYN_LokiRequestErrors - annotations: - message: '{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf - "%.2f" $value }}% errors.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Request-Errors - summary: At least 10% of requests are responded by 5xx server errors. - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiStackWriteRequestErrors - annotations: - message: '{{ printf "%.2f" $value }}% of write requests from {{ $labels.job - }} in {{ $labels.namespace }} are returned with server errors.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#LokiStack-Write-Request-Errors - summary: At least 10% of write requests to the lokistack-gateway are responded - with 5xx server errors. - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiStackReadRequestErrors - annotations: - message: '{{ printf "%.2f" $value }}% of query requests from {{ $labels.job - }} in {{ $labels.namespace }} are returned with server errors.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#LokiStack-Read-Request-Errors - summary: At least 10% of query requests to the lokistack-gateway are responded - with 5xx server errors. 
- expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiRequestPanics - annotations: - message: '{{ $labels.job }} is experiencing an increase of {{ $value }} - panics.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Request-Panics - summary: A panic was triggered. - expr: | - sum( - increase( - loki_panic_total[10m] - ) - ) by (job, namespace) - > 0 - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiRequestLatency - annotations: - message: '{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf - "%.2f" $value }}s 99th percentile latency.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Request-Latency - summary: The 99th percentile is experiencing high latency (higher than - 1 second). - expr: | - histogram_quantile(0.99, - sum( - irate( - loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m] - ) - ) by (job, le, namespace, route) - ) - > 1 - for: 15m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiTenantRateLimit - annotations: - message: '{{ $labels.job }} {{ $labels.route }} is experiencing 429 errors.' - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Tenant-Rate-Limit - summary: At least 10% of requests are responded with the rate limit error - code. - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiStorageSlowWrite - annotations: - message: The storage path is experiencing slow write response rates. - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Storage-Slow-Write - summary: The storage path is experiencing slow write response rates. - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"} - ) by (job, le, namespace) - ) - > 1 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiStorageSlowRead - annotations: - message: The storage path is experiencing slow read response rates. - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Storage-Slow-Read - summary: The storage path is experiencing slow read response rates. - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"} - ) by (job, le, namespace) - ) - > 5 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiWritePathHighLoad - annotations: - message: The write path is experiencing high load. 
- runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Write-Path-High-Load - summary: The write path is experiencing high load, causing backpressure - storage flushing. - expr: | - sum( - loki_ingester_wal_replay_flushing - ) by (job, namespace) - > 0 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_LokiReadPathHighLoad - annotations: - message: The read path is experiencing high load. - runbook_url: https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md#Loki-Read-Path-High-Load - summary: The read path has high volume of queries, causing longer response - times. - expr: | - histogram_quantile(0.99, - sum( - rate( - loki_logql_querystats_latency_seconds_bucket[5m] - ) - ) by (job, le, namespace) - ) - > 30 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-logging diff --git a/tests/golden/master/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/master/openshift4-logging/apps/openshift4-logging.yaml index e69de29..6825b97 100644 --- a/tests/golden/master/openshift4-logging/apps/openshift4-logging.yaml +++ b/tests/golden/master/openshift4-logging/apps/openshift4-logging.yaml @@ -0,0 +1,4 @@ +spec: + syncPolicy: + syncOptions: + - ServerSideApply=true diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/00_namespace.yaml index 1b27cf9..4c91e3c 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/00_namespace.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/00_namespace.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Namespace metadata: annotations: + argocd.argoproj.io/sync-wave: '-100' openshift.io/node-selector: '' labels: name: openshift-logging diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/10_operator_group.yaml index b72498d..52f645d 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/10_operator_group.yaml @@ -1,9 +1,9 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-90' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging -spec: {} diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/20_subscriptions.yaml index 750966b..2ec4ac9 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -1,7 +1,8 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: cluster-logging name: cluster-logging @@ -23,7 +24,8 @@ spec: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: loki-operator name: loki-operator @@ -41,3 +43,19 @@ spec: name: loki-operator source: openshift-operators-redhat sourceNamespace: openshift-operators-redhat +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-80' + labels: + 
name: cluster-observability-operator + name: cluster-observability-operator + namespace: openshift-operators-redhat +spec: + channel: development + installPlanApproval: Automatic + name: cluster-observability-operator + source: openshift-operators-redhat + sourceNamespace: openshift-operators-redhat diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 307f0ca..0000000 --- a/tests/golden/master/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: vector - logStore: - lokistack: - name: loki - type: lokistack - managementState: Managed diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_logstore.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/30_loki_logstore.yaml similarity index 100% rename from tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_logstore.yaml rename to tests/golden/master/openshift4-logging/openshift4-logging/30_loki_logstore.yaml diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_netpol.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/30_loki_netpol.yaml similarity index 100% rename from tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_netpol.yaml rename to tests/golden/master/openshift4-logging/openshift4-logging/30_loki_netpol.yaml diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/30_loki_plugin.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/30_loki_plugin.yaml new file mode 100644 index 0000000..3128c2f --- /dev/null +++ b/tests/golden/master/openshift4-logging/openshift4-logging/30_loki_plugin.yaml @@ -0,0 +1,13 @@ +apiVersion: observability.openshift.io/v1alpha1 +kind: UIPlugin +metadata: + labels: + name: logging + name: logging +spec: + logging: + logsLimit: 50 + lokiStack: + name: loki + timeout: 30s + type: Logging diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_rbac.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/30_loki_rbac.yaml similarity index 100% rename from tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_rbac.yaml rename to tests/golden/master/openshift4-logging/openshift4-logging/30_loki_rbac.yaml diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_stack.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/30_loki_stack.yaml similarity index 97% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_stack.yaml rename to tests/golden/master/openshift4-logging/openshift4-logging/30_loki_stack.yaml index 259068c..f859742 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_stack.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/30_loki_stack.yaml @@ -3,6 +3,7 @@ kind: LokiStack metadata: annotations: argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + argocd.argoproj.io/sync-wave: '-50' labels: name: loki name: loki diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_logforwarding.yaml 
b/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_logforwarding.yaml deleted file mode 100644 index f9b4483..0000000 --- a/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_logforwarding.yaml +++ /dev/null @@ -1,50 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogForwarder -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: bar - name: bar - namespace: foo -spec: - inputs: - - application: - namespaces: - - app-one - - app-two - name: my-apps - outputs: - - name: custom-forwarder - type: syslog - pipelines: - - inputRefs: - - my-apps - name: my-apps - outputRefs: - - custom-forwarder - serviceAccountName: ueli ---- -apiVersion: logging.openshift.io/v1 -kind: ClusterLogForwarder -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: hands - name: hands - namespace: jazz -spec: - outputs: - - name: splunk-forwarder - secret: - name: splunk-forwarder - type: fluentdForward - url: tls://splunk-forwarder:24224 - pipelines: - - inputRefs: - - application - name: application-logs - outputRefs: - - splunk-forwarder - serviceAccountName: hands diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_rolebinding.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_rolebinding.yaml deleted file mode 100644 index 14a605e..0000000 --- a/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_rolebinding.yaml +++ /dev/null @@ -1,33 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - annotations: {} - labels: - name: ueli - name: ueli - namespace: foo -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: collect-application-logs -subjects: - - kind: ServiceAccount - name: ueli - namespace: foo ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - annotations: {} - labels: - name: hands - name: hands - namespace: jazz -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: collect-application-logs -subjects: - - kind: ServiceAccount - name: hands - namespace: jazz diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_serviceaccount.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_serviceaccount.yaml deleted file mode 100644 index fc3c944..0000000 --- a/tests/golden/master/openshift4-logging/openshift4-logging/32_namespace_serviceaccount.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - annotations: {} - labels: - name: ueli - name: ueli - namespace: foo ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - annotations: {} - labels: - name: hands - name: hands - namespace: jazz diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/40_log_forwarder.yaml similarity index 76% rename from tests/golden/master/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml rename to tests/golden/master/openshift4-logging/openshift4-logging/40_log_forwarder.yaml index 6ed51d1..60b8ca2 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/40_log_forwarder.yaml @@ -1,4 +1,4 @@ -apiVersion: logging.openshift.io/v1 +apiVersion: observability.openshift.io/v1 kind: 
ClusterLogForwarder metadata: annotations: @@ -8,12 +8,21 @@ metadata: name: instance namespace: openshift-logging spec: + collector: + resources: + requests: + cpu: 20m + memory: 400M + tolerations: + - key: storagenode + operator: Exists inputs: - application: namespaces: - app-one - app-two name: my-apps + managementState: Managed outputs: - name: custom-forwarder type: syslog @@ -40,3 +49,5 @@ spec: outputRefs: - custom-forwarder parse: json + serviceAccount: + name: logcollector diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml new file mode 100644 index 0000000..cac68e1 --- /dev/null +++ b/tests/golden/master/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml @@ -0,0 +1,63 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-application-logs + name: logcollector-application-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-application-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-infrastructure-logs + name: logcollector-infrastructure-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-infrastructure-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-audit-logs + name: logcollector-audit-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-audit-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml similarity index 100% rename from tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml rename to tests/golden/master/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml similarity index 100% rename from tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml rename to tests/golden/master/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml similarity index 100% rename from tests/golden/legacy/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml rename to 
tests/golden/master/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_logstore.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_logstore.yaml deleted file mode 100644 index 77d8c18..0000000 --- a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_logstore.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -data: {} -kind: Secret -metadata: - annotations: {} - labels: - name: loki-logstore - name: loki-logstore -stringData: - access_key_id: '' - access_key_secret: '' - bucketnames: c-green-test-1234-logstore - endpoint: '' -type: Opaque diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_netpol.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_netpol.yaml deleted file mode 100644 index f2cd3bb..0000000 --- a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_netpol.yaml +++ /dev/null @@ -1,54 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - annotations: {} - labels: - name: allow-console-logging-view-plugin - name: allow-console-logging-view-plugin -spec: - ingress: - - from: - - podSelector: - matchLabels: - app: console - component: ui - - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: openshift-console - ports: - - port: 9443 - protocol: TCP - podSelector: - matchLabels: - app.kubernetes.io/created-by: openshift-logging_instance - app.kubernetes.io/name: logging-view-plugin - policyTypes: - - Ingress ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - annotations: {} - labels: - name: allow-console-logging-lokistack-gateway - name: allow-console-logging-lokistack-gateway -spec: - ingress: - - from: - - podSelector: - matchLabels: - app: console - component: ui - - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: openshift-console - ports: - - port: 8080 - protocol: TCP - podSelector: - matchLabels: - app.kubernetes.io/component: lokistack-gateway - app.kubernetes.io/instance: loki - app.kubernetes.io/name: lokistack - policyTypes: - - Ingress diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_rbac.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_rbac.yaml deleted file mode 100644 index d5dde59..0000000 --- a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_rbac.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - annotations: {} - labels: - name: syn-loki-cluster-reader - rbac.authorization.k8s.io/aggregate-to-cluster-reader: 'true' - name: syn:loki:cluster-reader -rules: - - apiGroups: - - loki.grafana.com - resourceNames: - - logs - resources: - - application - - infrastructure - verbs: - - get diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml index 19adca5..4d23850 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - annotations: {} labels: name: syn-collector-rules name: syn-collector-rules @@ -12,7 +11,7 @@ spec: rules: - alert: SYN_CollectorNodeDown annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ 
$labels.pod + description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m. summary: Collector cannot be scraped expr: | @@ -23,97 +22,10 @@ spec: severity: critical syn: 'true' syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: In Red Hat OpenShift Logging Operator 6.0, support for the Red - Hat Elasticsearch Operator has been removed. Bug fixes and support are - provided only through the end of the 5.9 lifecycle. As an alternative - to the Elasticsearch Operator, you can use the Loki Operator instead. - summary: Detected Elasticsearch as the in-cluster storage, which has been - removed in 6.0 release - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd - as a collector has been removed. Bug fixes and support are provided - only through the end of the 5.9 lifecycle. As an alternative to Fluentd, - you can use the Vector collector instead. - summary: Detected Fluentd as the collector, which has been removed in - a 6.0 release - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: In Red Hat OpenShift Logging Operator 6.0, support for Kibana - as a data visualization dashboard has been removed. Bug fixes and support - are provided only through the end of the 5.9 lifecycle. As an alternative - to Kibana, you can use the Grafana Dashboard instead. 
- summary: Detected Kibana as the log data visualization, which has been - removed in the 6.0 release - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..614581b 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - annotations: {} labels: name: syn-loki-logging-rules name: syn-loki-logging-rules @@ -204,6 +203,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. 
+ expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/golden/multilineerr/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/multilineerr/openshift4-logging/apps/openshift4-logging.yaml index e69de29..6825b97 100644 --- a/tests/golden/multilineerr/openshift4-logging/apps/openshift4-logging.yaml +++ b/tests/golden/multilineerr/openshift4-logging/apps/openshift4-logging.yaml @@ -0,0 +1,4 @@ +spec: + syncPolicy: + syncOptions: + - ServerSideApply=true diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/00_namespace.yaml index 1b27cf9..4c91e3c 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/00_namespace.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/00_namespace.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Namespace metadata: annotations: + argocd.argoproj.io/sync-wave: '-100' openshift.io/node-selector: '' labels: name: openshift-logging diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/10_operator_group.yaml index ff11675..52f645d 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/10_operator_group.yaml @@ -1,11 +1,9 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-90' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml index 1f0b7ad..1abaf03 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -1,13 +1,14 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.9 + channel: stable-6.1 config: resources: limits: @@ -23,13 +24,14 @@ spec: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: loki-operator name: loki-operator namespace: openshift-operators-redhat spec: - channel: stable-5.9 + channel: stable-6.1 config: resources: limits: @@ -41,3 +43,19 @@ spec: name: loki-operator source: openshift-operators-redhat sourceNamespace: openshift-operators-redhat +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-80' + labels: + name: cluster-observability-operator + name: cluster-observability-operator + namespace: openshift-operators-redhat +spec: + channel: development + installPlanApproval: Automatic + name: 
cluster-observability-operator + source: openshift-operators-redhat + sourceNamespace: openshift-operators-redhat diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 307f0ca..0000000 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: vector - logStore: - lokistack: - name: loki - type: lokistack - managementState: Managed diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_logstore.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_logstore.yaml similarity index 100% rename from tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_logstore.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_logstore.yaml diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_netpol.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_netpol.yaml similarity index 100% rename from tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_netpol.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_netpol.yaml diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_plugin.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_plugin.yaml new file mode 100644 index 0000000..3128c2f --- /dev/null +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_plugin.yaml @@ -0,0 +1,13 @@ +apiVersion: observability.openshift.io/v1alpha1 +kind: UIPlugin +metadata: + labels: + name: logging + name: logging +spec: + logging: + logsLimit: 50 + lokiStack: + name: loki + timeout: 30s + type: Logging diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_rbac.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_rbac.yaml similarity index 100% rename from tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_rbac.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_rbac.yaml diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_stack.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_stack.yaml similarity index 97% rename from tests/golden/master/openshift4-logging/openshift4-logging/50_loki_stack.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_stack.yaml index 259068c..f859742 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_stack.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_stack.yaml @@ -3,6 +3,7 @@ kind: LokiStack metadata: annotations: argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + argocd.argoproj.io/sync-wave: '-50' labels: name: loki name: loki diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder.yaml similarity index 65% rename from 
tests/golden/multilineerr/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder.yaml index 15009bd..df14feb 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder.yaml @@ -1,4 +1,4 @@ -apiVersion: logging.openshift.io/v1 +apiVersion: observability.openshift.io/v1 kind: ClusterLogForwarder metadata: annotations: @@ -8,6 +8,15 @@ metadata: name: instance namespace: openshift-logging spec: + collector: + resources: + requests: + cpu: 20m + memory: 400M + tolerations: + - key: storagenode + operator: Exists + managementState: Managed pipelines: - detectMultilineErrors: true inputRefs: @@ -21,3 +30,5 @@ spec: name: infrastructure-logs outputRefs: - default + serviceAccount: + name: logcollector diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml new file mode 100644 index 0000000..cac68e1 --- /dev/null +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder_rbac.yaml @@ -0,0 +1,63 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-application-logs + name: logcollector-application-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-application-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-infrastructure-logs + name: logcollector-infrastructure-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-infrastructure-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + argocd.argoproj.io/sync-wave: '-50' + labels: + name: logcollector-audit-logs + name: logcollector-audit-logs + namespace: openshift-logging +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: collect-audit-logs +subjects: + - kind: ServiceAccount + name: logcollector + namespace: openshift-logging diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml similarity index 100% rename from tests/golden/master/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml similarity index 100% rename from tests/golden/master/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml rename to 
tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml similarity index 100% rename from tests/golden/master/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml deleted file mode 100644 index dcca6fb..0000000 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml +++ /dev/null @@ -1,153 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - annotations: {} - labels: - name: loki-ingester-check - name: loki-ingester-check - namespace: openshift-logging ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - annotations: {} - labels: - name: loki-ingester-check - name: loki-ingester-check - namespace: openshift-logging -rules: - - apiGroups: - - '' - resources: - - pods - - pods/exec - verbs: - - get - - list - - watch - - create - - delete - - patch - - update ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - annotations: {} - labels: - name: loki-ingester-check - name: loki-ingester-check - namespace: openshift-logging -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: loki-ingester-check -subjects: - - kind: ServiceAccount - name: loki-ingester-check ---- -apiVersion: v1 -data: - wal-check.sh: | - #!/bin/bash - - set -e -o pipefail - - # Check if pod is in stuck state. - function check_pod() { - POD_NAME="loki-ingester-${1}" - echo "checking POD ${POD_NAME}" - PHASE=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.phase') - if [ ${PHASE} != "Running" ]; then - return 0 - fi - READY=$(kubectl -n openshift-logging get po ${POD_NAME} -oyaml | yq '.status.conditions[] | select(.type == "ContainersReady") | .status') - if [ ${READY} == "True" ]; then - return 0 - fi - return 1 - } - - # Check directories of pod and remove non-existing checkpoint if present. - function check_dir() { - shopt -s extglob - POD_NAME="loki-ingester-${1}" - echo "checking DIR ${POD_NAME}" - DIR_CHP=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^checkpoint\.[0-9]*$") - PATTERN=$(echo ${DIR_CHP} | sed 's/[^0-9]*//g') - DIR_WAL=$(kubectl -n openshift-logging exec -i ${POD_NAME} -- ls /tmp/wal | grep -o "^0*${PATTERN}$" || exit 0) - if [ -z $DIR_WAL ]; then - kubectl -n openshift-logging exec -i ${POD_NAME} -- rm -rf /tmp/wal/${DIR_CHP} - kubectl -n openshift-logging delete po ${POD_NAME} - fi - } - - # Check if pods are in stuck state for longer than ${SLEEP_TIME}. - # Only fix 1 pod at a time and immediatly exit if it is fixed. - function fix_pod() { - if ! check_pod $1; then - echo "stuck POD, waiting ${SLEEP_TIME}" - sleep ${SLEEP_TIME} - if ! 
check_pod $1; then - check_dir $1 - exit 0 - fi - fi - } - - fix_pod 0 - fix_pod 1 - - exit 0 -kind: ConfigMap -metadata: - annotations: {} - labels: - name: loki-ingester-check - name: loki-ingester-check - namespace: openshift-logging ---- -apiVersion: batch/v1 -kind: CronJob -metadata: - annotations: {} - labels: - name: loki-ingester-check - name: loki-ingester-check - namespace: openshift-logging -spec: - concurrencyPolicy: Forbid - failedJobsHistoryLimit: 0 - jobTemplate: - spec: - activeDeadlineSeconds: 360 - backoffLimit: 1 - template: - spec: - containers: - - command: - - /usr/local/bin/wal-check.sh - env: - - name: SLEEP_TIME - value: 2m - image: quay.io/appuio/oc:v4.14 - imagePullPolicy: IfNotPresent - name: check-pod - ports: [] - stdin: false - tty: false - volumeMounts: - - mountPath: /usr/local/bin/wal-check.sh - name: wal-check - readOnly: true - subPath: wal-check.sh - nodeSelector: - node-role.kubernetes.io/infra: '' - restartPolicy: Never - serviceAccountName: loki-ingester-check - volumes: - - configMap: - defaultMode: 364 - name: loki-ingester-check - name: wal-check - schedule: '*/10 * * * *' diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml deleted file mode 100644 index 5e7989d..0000000 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - annotations: {} - labels: - name: logging-application-logs-reader-aggregate - rbac.authorization.k8s.io/aggregate-to-admin: 'true' - name: logging-application-logs-reader-aggregate -rules: - - apiGroups: - - loki.grafana.com - resourceNames: - - logs - resources: - - application - verbs: - - get diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logstore.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logstore.yaml deleted file mode 100644 index 77d8c18..0000000 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logstore.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -data: {} -kind: Secret -metadata: - annotations: {} - labels: - name: loki-logstore - name: loki-logstore -stringData: - access_key_id: '' - access_key_secret: '' - bucketnames: c-green-test-1234-logstore - endpoint: '' -type: Opaque diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_netpol.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_netpol.yaml deleted file mode 100644 index f2cd3bb..0000000 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_netpol.yaml +++ /dev/null @@ -1,54 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - annotations: {} - labels: - name: allow-console-logging-view-plugin - name: allow-console-logging-view-plugin -spec: - ingress: - - from: - - podSelector: - matchLabels: - app: console - component: ui - - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: openshift-console - ports: - - port: 9443 - protocol: TCP - podSelector: - matchLabels: - app.kubernetes.io/created-by: openshift-logging_instance - app.kubernetes.io/name: logging-view-plugin - policyTypes: - - Ingress ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - annotations: {} - labels: - name: allow-console-logging-lokistack-gateway - name: 
allow-console-logging-lokistack-gateway -spec: - ingress: - - from: - - podSelector: - matchLabels: - app: console - component: ui - - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: openshift-console - ports: - - port: 8080 - protocol: TCP - podSelector: - matchLabels: - app.kubernetes.io/component: lokistack-gateway - app.kubernetes.io/instance: loki - app.kubernetes.io/name: lokistack - policyTypes: - - Ingress diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml deleted file mode 100644 index 0b86fe6..0000000 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - annotations: - argocd.argoproj.io/sync-options: Prune=false,Delete=false - kubernetes.io/service-account.name: loki-operator-controller-manager-metrics-reader - labels: - name: loki-operator-controller-manager-metrics-token - name: loki-operator-controller-manager-metrics-token - namespace: openshift-operators-redhat -type: kubernetes.io/service-account-token diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_rbac.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_rbac.yaml deleted file mode 100644 index d5dde59..0000000 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_rbac.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - annotations: {} - labels: - name: syn-loki-cluster-reader - rbac.authorization.k8s.io/aggregate-to-cluster-reader: 'true' - name: syn:loki:cluster-reader -rules: - - apiGroups: - - loki.grafana.com - resourceNames: - - logs - resources: - - application - - infrastructure - verbs: - - get diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml index 268663f..4d23850 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - annotations: {} labels: name: syn-collector-rules name: syn-collector-rules @@ -12,7 +11,7 @@ spec: rules: - alert: SYN_CollectorNodeDown annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m. summary: Collector cannot be scraped expr: | @@ -23,97 +22,10 @@ spec: severity: critical syn: 'true' syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' 
- summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: The OpenShift Elasticsearch Operator is deprecated and is planned - to be removed in a future release. Red Hat provides bug fixes and support - for this feature during the current release lifecycle, but this feature - no longer receives enhancements. As an alternative to using the OpenShift - Elasticsearch Operator to manage the default log storage, you can use - the Loki Operator. - summary: Detected Elasticsearch as the in-cluster storage which is deprecated - and will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: Fluentd is deprecated and is planned to be removed in a future - release. Red Hat provides bug fixes and support for this feature during - the current release lifecycle, but this feature no longer receives enhancements. - As an alternative to Fluentd, you can use Vector instead. - summary: Detected Fluentd as the collector which is deprecated and will - be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: The Kibana web console is now deprecated and is planned to be - removed in a future logging release. - summary: Detected Kibana as the visualization which is deprecated and - will be removed in a future release. 
- expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..614581b 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - annotations: {} labels: name: syn-loki-logging-rules name: syn-loki-logging-rules @@ -204,6 +203,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. 
+ expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/legacy.yml b/tests/legacy.yml deleted file mode 100644 index 7d5565b..0000000 --- a/tests/legacy.yml +++ /dev/null @@ -1,57 +0,0 @@ -applications: - - openshift4-operators as openshift-operators-redhat - - openshift4-monitoring - -parameters: - kapitan: - dependencies: - - type: https - source: https://raw.githubusercontent.com/appuio/component-openshift4-operators/v1.0.2/lib/openshift4-operators.libsonnet - output_path: vendor/lib/openshift4-operators.libsonnet - - type: https - source: https://raw.githubusercontent.com/appuio/component-openshift4-monitoring/v2.9.0/lib/openshift4-monitoring-alert-patching.libsonnet - output_path: vendor/lib/alert-patching.libsonnet - compile: - - input_type: jsonnet - input_paths: - - tests/console-patch.jsonnet - output_path: console-patching/ - - openshift4_operators: - defaultInstallPlanApproval: Automatic - defaultSource: openshift-operators-redhat - defaultSourceNamespace: openshift-operators-redhat - - openshift4_logging: - clusterLogForwarding: - enabled: true - forwarders: - custom-forwarder: - type: syslog - my-other-forwarder: - type: elasticsearch - elasticsearch: - version: 8 - namespace_groups: - my-apps: - namespaces: - - app-one - - app-two - forwarders: - - custom-forwarder - json: true - json: - enabled: true - application_logs: - json: true - forwarders: - - my-other-forwarder - infrastructure_logs: - json: true - detectMultilineErrors: true - - clusterLogForwarder: - pipelines: - application-logs: - outputRefs: - - my-forwarder diff --git a/tests/master.yml b/tests/master.yml index d0afa9c..8d248eb 100644 --- a/tests/master.yml +++ b/tests/master.yml @@ -24,9 +24,12 @@ parameters: openshift4_logging: channel: 'stable' - alerts: 'master' clusterLogForwarder: + filters: + my-filter: + openshiftLabels: + cluster_id: ${cluster:name} inputs: my-apps: application: @@ -50,38 +53,8 @@ parameters: outputRefs: - custom-forwarder - namespaceLogForwarderEnabled: true - namespaceLogForwarder: - jazz/hands: - outputs: - splunk-forwarder: - secret: - name: splunk-forwarder - type: fluentdForward - url: tls://splunk-forwarder:24224 - pipelines: - application-logs: - inputRefs: - - application - outputRefs: - - splunk-forwarder - foo/bar: - serviceAccountName: ueli - inputs: - my-apps: - application: - namespaces: - - app-one - - app-two - outputs: - custom-forwarder: - type: syslog - pipelines: - my-apps: - inputRefs: - - my-apps - outputRefs: - - custom-forwarder + alerts: + release: 'master' secrets: my-secret: