From 0c7d061c7e29913e8bf7276655c72ad5854f31fd Mon Sep 17 00:00:00 2001 From: "lukasz.widera@vshn.ch" Date: Fri, 3 Nov 2023 14:20:18 +0100 Subject: [PATCH 01/11] adding memory and storage alerts for Redis alert --- component/component/common.libsonnet | 108 ++++++++ component/component/vshn_postgres.jsonnet | 230 ++++++---------- component/component/vshn_redis.jsonnet | 14 +- .../appcat/21_composition_vshn_postgres.yaml | 252 +++++++++--------- .../21_composition_vshn_postgresrestore.yaml | 252 +++++++++--------- .../appcat/21_composition_vshn_redis.yaml | 102 +++++++ 6 files changed, 566 insertions(+), 392 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index 718d8a886..58ecbb229 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -173,6 +173,108 @@ local argoCDAnnotations() = { 'argocd.argoproj.io/sync-options': 'Prune=false', }; +local bottomPod(query) = 'label_replace( bottomk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; +local topPod(query) = 'label_replace( topk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; + +local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts) = { + // standardized lowercase regardless of what came as input + local serviceNameLower = std.asciiLower(serviceName), + local toReplace = 'vshn-replacemeplease', + name: 'prometheusrule', + base: { + + apiVersion: 'kubernetes.crossplane.io/v1alpha1', + kind: 'Object', + metadata: { + name: 'prometheusrule', + }, + spec: { + providerConfigRef: { + name: 'kubernetes', + }, + forProvider: { + manifest: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + }, + spec+: { + forProvider+: { + manifest+: { + metadata: { + name: '%s-rules' % serviceName, + }, + spec: { + groups: [ + { + name: '%s-general-alerts' % serviceName, + rules: [ + { + name: '%s-storage' % serviceName, + local queries = { + availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', + availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, + usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', + unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', + }, + rules: + [ + { + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), + 'for': '1m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + { + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'Based on 
recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
+                            runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
+                            summary: 'PersistentVolume is filling up.',
+                          },
+                          expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)),
+                          'for': '1h',
+                          labels: {
+                            severity: 'warning',
+                          },
+                        },
+                      ],
+                    },
+                    {
+                      alert: serviceName + 'MemoryCritical',
+                      name: std.asciiLower(serviceName) + '-memory',
+                      annotations: {
+                        description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours.\n Please reduce the load of this instance, or increase the memory.',
+                        // runbook_url: 'TBD',
+                        summary: 'Memory usage critical',
+                      },
+                      expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90') % memoryContainerName, toReplace, 'vshn-' + std.asciiLower(serviceName)),
+                      'for': '120m',
+                      labels: {
+                        severity: 'warning',
+                        syn_team: 'schedar',
+                      },
+                    },
+                  ] + additionalAlerts,
+                },
+              ],
+            },
+          },
+        },
+      },
+    },
+  },
+};
+
+
 {
   SyncOptions: syncOptions,
   VshnMetaDBaaSExoscale(dbname):
@@ -203,4 +305,10 @@ local argoCDAnnotations() = {
     removeField(obj, name),
   ArgoCDAnnotations(): argoCDAnnotations(),
+  GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts):
+    generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts),
+  topPod(query):
+    topPod(query),
+  bottomPod(query):
+    bottomPod(query),
 }
diff --git a/component/component/vshn_postgres.jsonnet b/component/component/vshn_postgres.jsonnet
index a1fc80a9e..e7b37d56d 100644
--- a/component/component/vshn_postgres.jsonnet
+++ b/component/component/vshn_postgres.jsonnet
@@ -762,159 +762,91 @@ local clusterRestoreConfig = {
   ],
 };
 
-local prometheusRule = {
-  name: 'prometheusrule',
-  base: comp.KubeObject('monitoring.coreos.com/v1', 'PrometheusRule') + {
-    spec+: {
-      forProvider+: {
-        manifest+: {
-          metadata: {
-            name: 'postgresql-rules',
-          },
-          local bottomPod(query) = 'label_replace( bottomk(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+")' % query,
-          local topPod(query) = 'label_replace( topk(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+")' % query,
-          spec: {
-            groups: [
-              {
-                name: 'postgresql-storage',
-                local queries = {
-                  availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet",
metrics_path="/metrics"}', - availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, - usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', - unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', - }, - rules: [ - { - alert: 'PostgreSQLPersistentVolumeFillingUp', - annotations: { - description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), - 'for': '1m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - { - alert: 'PostgreSQLPersistentVolumeFillingUp', - annotations: { - description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), - 'for': '1h', - labels: { - severity: 'warning', - }, - }, - ], - }, - { - name: 'postgresql-memory', - rules: [ - { - alert: 'PostgreSQLMemoryCritical', - annotations: { - description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', - // runbook_url: 'TBD', - summary: 'Memory usage critical', - }, - expr: topPod('(container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85'), - 'for': '120m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-connections', - rules: [ - { - alert: 'PostgreSQLConnectionsCritical', - annotations: { - description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.', - // runbook_url: 'TBD', - summary: 'Connection usage critical', - }, - expr: topPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), - 'for': '120m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - // new - { - name: 'postgresql-replication', - rules: [ - { - alert: 'PostgreSQLReplicationCritical', - annotations: { - description: 'The number of replicas for the instance {{ $labels.cluster_name }} in namespace {{ $labels.namespace }}. 
Please check pod counts in affected namespace.', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical', - summary: 'Replication status check', - }, - expr: 'pg_replication_slots_active == 0', - 'for': '10m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-replication-lag', - rules: [ - { - alert: 'PostgreSQLReplicationLagCritical', - annotations: { - description: 'Replication lag size on namespace {{$labels.exported_namespace}} instance ({{$labels.application_name}}) is currently {{ $value | humanize1024}}B behind the leader.', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical', - summary: 'Replication lag status check', - }, - expr: 'pg_replication_status_lag_size > 1e+09', - 'for': '5m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-replication-count', - rules: [ - { - alert: 'PostgreSQLPodReplicasCritical', - annotations: { - description: 'Replication is broken in namespace {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}).', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical', - summary: 'Replication lag status check', - }, - expr: 'kube_statefulset_status_replicas_available{statefulset=~".+", namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"}', - 'for': '5m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - ], + + expr: std.strReplace(common.topPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), 'vshn-replacemeplease', 'vshn-' + std.asciiLower('PostgreSQL')), + 'for': '120m', + labels: { + severity: 'critical', + syn_team: 'schedar', }, }, - }, + ], }, - }, + // new + { + name: 'postgresql-replication', + rules: [ + { + alert: 'PostgreSQLReplicationCritical', + annotations: { + description: 'The number of replicas for the instance {{ $labels.cluster_name }} in namespace {{ $labels.namespace }}. 
Please check pod counts in affected namespace.', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical', + summary: 'Replication status check', + }, + expr: 'pg_replication_slots_active == 0', + 'for': '10m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + ], + }, + { + name: 'postgresql-replication-lag', + rules: [ + { + alert: 'PostgreSQLReplicationLagCritical', + annotations: { + description: 'Replication lag size on namespace {{$labels.exported_namespace}} instance ({{$labels.application_name}}) is currently {{ $value | humanize1024}}B behind the leader.', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical', + summary: 'Replication lag status check', + }, + expr: 'pg_replication_status_lag_size > 1e+09', + 'for': '5m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + ], + }, + { + name: 'postgresql-replication-count', + rules: [ + { + alert: 'PostgreSQLPodReplicasCritical', + annotations: { + description: 'Replication is broken in namespace {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}).', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical', + summary: 'Replication lag status check', + }, + expr: 'kube_statefulset_status_replicas_available{statefulset=~".+", namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"}', + 'for': '5m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + ], + }, + ] +) + { patches: [ comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'), comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-postgresql'), diff --git a/component/component/vshn_redis.jsonnet b/component/component/vshn_redis.jsonnet index 6c2851c1b..ae86753cd 100644 --- a/component/component/vshn_redis.jsonnet +++ b/component/component/vshn_redis.jsonnet @@ -386,6 +386,14 @@ local composition = }, }; + local prometheusRule = common.GeneratePrometheusNonSLORules('redis', 'redis', []) + { + patches: [ + comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'), + comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-redis'), + ], + }; + + local redisHelmChart = { apiVersion: 'helm.crossplane.io/v1beta1', @@ -407,7 +415,10 @@ local composition = { name: 'REDIS_EXPORTER_SKIP_TLS_VERIFICATION', value: 'true', - + }, + { + name: 'REDIS_EXPORTER_INCL_SYSTEM_METRICS', + value: 'true', }, ], containerSecurityContext: { @@ -537,6 +548,7 @@ local composition = comp.ToCompositeFieldPath('status.atProvider.manifest.metadata.labels[appuio.io/organization]', 'metadata.labels[appuio.io/organization]'), ], }, + prometheusRule, { name: 'namespace-conditions', base: namespace, diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index d5a680374..5fafa3d48 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -949,132 +949,142 @@ spec: - base: apiVersion: 
kubernetes.crossplane.io/v1alpha1 kind: Object - metadata: {} + metadata: + name: prometheusrule spec: forProvider: manifest: apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule - metadata: - name: postgresql-rules - spec: - groups: - - name: postgresql-storage - rules: - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance {{ $labels.name - }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} - == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume claimed - by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is expected to fill up within four days. Currently - {{ $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1h - labels: - severity: warning - - name: postgresql-memory - rules: - - alert: PostgreSQLMemoryCritical - annotations: - description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. - Please reducde the load of this instance, or increase the memory. 
- summary: Memory usage critical - expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / - on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-connections - rules: - - alert: PostgreSQLConnectionsCritical - annotations: - description: |- - The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. - Please reduce the load of this instance. - summary: Connection usage critical - expr: label_replace( topk(1, sum(pg_stat_activity_count) by - (pod, namespace) > 90/100 * sum(pg_settings_max_connections) - by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication - rules: - - alert: PostgreSQLReplicationCritical - annotations: - description: The number of replicas for the instance {{ - $labels.cluster_name }} in namespace {{ $labels.namespace - }}. Please check pod counts in affected namespace. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical - summary: Replication status check - expr: pg_replication_slots_active == 0 - for: 10m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-lag - rules: - - alert: PostgreSQLReplicationLagCritical - annotations: - description: Replication lag size on namespace {{$labels.exported_namespace}} - instance ({{$labels.application_name}}) is currently {{ - $value | humanize1024}}B behind the leader. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical - summary: Replication lag status check - expr: pg_replication_status_lag_size > 1e+09 - for: 5m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-count - rules: - - alert: PostgreSQLPodReplicasCritical - annotations: - description: Replication is broken in namespace {{$labels.namespace}}, - check statefulset ({{$labels.statefulset}}). - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical - summary: Replication lag status check - expr: kube_statefulset_status_replicas_available{statefulset=~".+", - namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} - for: 5m - labels: - severity: critical - syn_team: schedar + spec: + forProvider: + manifest: + metadata: + name: PostgreSQL-rules + spec: + groups: + - name: PostgreSQL-general-alerts + rules: + - name: PostgreSQL-storage + rules: + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance + {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume + claimed by the instance {{ $labels.name }} in + namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: PostgreSQLMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: postgresql-memory + - name: postgresql-connections + rules: + - alert: PostgreSQLConnectionsCritical + annotations: + description: |- + The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. + Please reduce the load of this instance. 
+ summary: Connection usage critical + expr: label_replace( topk(1, sum(pg_stat_activity_count) + by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) + by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication + rules: + - alert: PostgreSQLReplicationCritical + annotations: + description: The number of replicas for the instance + {{ $labels.cluster_name }} in namespace {{ $labels.namespace + }}. Please check pod counts in affected namespace. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical + summary: Replication status check + expr: pg_replication_slots_active == 0 + for: 10m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-lag + rules: + - alert: PostgreSQLReplicationLagCritical + annotations: + description: Replication lag size on namespace {{$labels.exported_namespace}} + instance ({{$labels.application_name}}) is currently + {{ $value | humanize1024}}B behind the leader. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical + summary: Replication lag status check + expr: pg_replication_status_lag_size > 1e+09 + for: 5m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-count + rules: + - alert: PostgreSQLPodReplicasCritical + annotations: + description: Replication is broken in namespace + {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}). + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical + summary: Replication lag status check + expr: kube_statefulset_status_replicas_available{statefulset=~".+", + namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} + for: 5m + labels: + severity: critical + syn_team: schedar providerConfigRef: name: kubernetes name: prometheusrule diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index ee6e7fda2..63f3afb53 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1051,132 +1051,142 @@ spec: - base: apiVersion: kubernetes.crossplane.io/v1alpha1 kind: Object - metadata: {} + metadata: + name: prometheusrule spec: forProvider: manifest: apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule - metadata: - name: postgresql-rules - spec: - groups: - - name: postgresql-storage - rules: - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance {{ $labels.name - }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
- expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} - == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume claimed - by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is expected to fill up within four days. Currently - {{ $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1h - labels: - severity: warning - - name: postgresql-memory - rules: - - alert: PostgreSQLMemoryCritical - annotations: - description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. - Please reducde the load of this instance, or increase the memory. - summary: Memory usage critical - expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / - on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-connections - rules: - - alert: PostgreSQLConnectionsCritical - annotations: - description: |- - The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. - Please reduce the load of this instance. 
- summary: Connection usage critical - expr: label_replace( topk(1, sum(pg_stat_activity_count) by - (pod, namespace) > 90/100 * sum(pg_settings_max_connections) - by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication - rules: - - alert: PostgreSQLReplicationCritical - annotations: - description: The number of replicas for the instance {{ - $labels.cluster_name }} in namespace {{ $labels.namespace - }}. Please check pod counts in affected namespace. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical - summary: Replication status check - expr: pg_replication_slots_active == 0 - for: 10m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-lag - rules: - - alert: PostgreSQLReplicationLagCritical - annotations: - description: Replication lag size on namespace {{$labels.exported_namespace}} - instance ({{$labels.application_name}}) is currently {{ - $value | humanize1024}}B behind the leader. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical - summary: Replication lag status check - expr: pg_replication_status_lag_size > 1e+09 - for: 5m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-count - rules: - - alert: PostgreSQLPodReplicasCritical - annotations: - description: Replication is broken in namespace {{$labels.namespace}}, - check statefulset ({{$labels.statefulset}}). - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical - summary: Replication lag status check - expr: kube_statefulset_status_replicas_available{statefulset=~".+", - namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} - for: 5m - labels: - severity: critical - syn_team: schedar + spec: + forProvider: + manifest: + metadata: + name: PostgreSQL-rules + spec: + groups: + - name: PostgreSQL-general-alerts + rules: + - name: PostgreSQL-storage + rules: + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance + {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume + claimed by the instance {{ $labels.name }} in + namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: PostgreSQLMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: postgresql-memory + - name: postgresql-connections + rules: + - alert: PostgreSQLConnectionsCritical + annotations: + description: |- + The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. + Please reduce the load of this instance. 
+ summary: Connection usage critical + expr: label_replace( topk(1, sum(pg_stat_activity_count) + by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) + by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication + rules: + - alert: PostgreSQLReplicationCritical + annotations: + description: The number of replicas for the instance + {{ $labels.cluster_name }} in namespace {{ $labels.namespace + }}. Please check pod counts in affected namespace. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical + summary: Replication status check + expr: pg_replication_slots_active == 0 + for: 10m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-lag + rules: + - alert: PostgreSQLReplicationLagCritical + annotations: + description: Replication lag size on namespace {{$labels.exported_namespace}} + instance ({{$labels.application_name}}) is currently + {{ $value | humanize1024}}B behind the leader. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical + summary: Replication lag status check + expr: pg_replication_status_lag_size > 1e+09 + for: 5m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-count + rules: + - alert: PostgreSQLPodReplicasCritical + annotations: + description: Replication is broken in namespace + {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}). + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical + summary: Replication lag status check + expr: kube_statefulset_status_replicas_available{statefulset=~".+", + namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} + for: 5m + labels: + severity: critical + syn_team: schedar providerConfigRef: name: kubernetes name: prometheusrule diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index 7008a1f42..8e7cac91b 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -83,6 +83,106 @@ spec: - fromFieldPath: status.atProvider.manifest.metadata.labels[appuio.io/organization] toFieldPath: metadata.labels[appuio.io/organization] type: ToCompositeFieldPath + - base: + apiVersion: kubernetes.crossplane.io/v1alpha1 + kind: Object + metadata: + name: prometheusrule + spec: + forProvider: + manifest: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + spec: + forProvider: + manifest: + metadata: + name: redis-rules + spec: + groups: + - name: redis-general-alerts + rules: + - name: redis-storage + rules: + - alert: redisPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance + {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-redis-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: redisPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume + claimed by the instance {{ $labels.name }} in + namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-redis-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: redisMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. 
+ summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="redis"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: redis-memory + providerConfigRef: + name: kubernetes + name: prometheusrule + patches: + - fromFieldPath: metadata.labels[crossplane.io/composite] + toFieldPath: metadata.name + transforms: + - string: + fmt: '%s-prometheusrule' + type: Format + type: string + type: FromCompositeFieldPath + - fromFieldPath: metadata.labels[crossplane.io/composite] + toFieldPath: spec.forProvider.manifest.metadata.namespace + transforms: + - string: + fmt: vshn-redis-%s + type: Format + type: string + type: FromCompositeFieldPath - base: apiVersion: kubernetes.crossplane.io/v1alpha1 kind: Object @@ -706,6 +806,8 @@ spec: extraEnvVars: - name: REDIS_EXPORTER_SKIP_TLS_VERIFICATION value: 'true' + - name: REDIS_EXPORTER_INCL_SYSTEM_METRICS + value: 'true' serviceMonitor: enabled: true namespace: '' From 0f4cd458f764980cca2d18bdc553c015ece00e78 Mon Sep 17 00:00:00 2001 From: "lukasz.widera@vshn.ch" Date: Tue, 14 Nov 2023 11:49:44 +0100 Subject: [PATCH 02/11] solving duplicated spec issue --- component/component/common.libsonnet | 134 +++++----- .../appcat/21_composition_vshn_postgres.yaml | 241 +++++++++--------- .../21_composition_vshn_postgresrestore.yaml | 241 +++++++++--------- .../appcat/21_composition_vshn_redis.yaml | 128 +++++----- 4 files changed, 359 insertions(+), 385 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index 58ecbb229..d63c76e0f 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -192,81 +192,75 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional providerConfigRef: { name: 'kubernetes', }, - forProvider: { - manifest: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'PrometheusRule', - }, - spec+: { - forProvider+: { - manifest+: { - metadata: { - name: '%s-rules' % serviceName, - }, - spec: { - groups: [ + forProvider+: { + manifest+: { + metadata: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + name: '%s-rules' % serviceName, + }, + spec: { + groups: [ + { + name: '%s-general-alerts' % serviceName, + rules: [ { - name: '%s-general-alerts' % serviceName, - rules: [ - { - name: '%s-storage' % serviceName, - local queries = { - availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', - availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, - usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', - unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', - }, - rules: - [ - { - alert: serviceName + 'PersistentVolumeFillingUp', - annotations: { - description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', - runbook_url: 
'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), - 'for': '1m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - { - alert: serviceName + 'PersistentVolumeFillingUp', - annotations: { - description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), - 'for': '1h', - labels: { - severity: 'warning', - }, - }, - ], - }, - { - alert: serviceName + 'MemoryCritical', - name: std.asciiLower(serviceName) + '-memory', - annotations: { - description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', - // runbook_url: 'TBD', - summary: 'Memory usage critical', + name: '%s-storage' % serviceName, + local queries = { + availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', + availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, + usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', + unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', + }, + rules: + [ + { + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), + 'for': '1m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, }, - expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90') % memoryContainerName, toReplace, 'vshn-' + std.asciiLower(serviceName)), - 'for': '120m', - labels: { - severity: 'warning', - syn_team: 'schedar', + { + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. 
Currently {{ $value | humanizePercentage }} is available.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), + 'for': '1h', + labels: { + severity: 'warning', + }, }, - }, - ] + additionalAlerts, + ], }, - ], + { + alert: serviceName + 'MemoryCritical', + name: std.asciiLower(serviceName) + '-memory', + annotations: { + description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', + // runbook_url: 'TBD', + summary: 'Memory usage critical', + }, + expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90') % memoryContainerName, toReplace, 'vshn-' + std.asciiLower(serviceName)), + 'for': '120m', + labels: { + severity: 'warning', + syn_team: 'schedar', + }, + }, + ] + additionalAlerts, }, - }, + ], }, }, }, diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index 5fafa3d48..f4bc1b0bf 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -954,137 +954,130 @@ spec: spec: forProvider: manifest: - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule - spec: - forProvider: - manifest: - metadata: - name: PostgreSQL-rules - spec: - groups: - - name: PostgreSQL-general-alerts + metadata: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + name: PostgreSQL-rules + spec: + groups: + - name: PostgreSQL-general-alerts + rules: + - name: PostgreSQL-storage rules: - - name: PostgreSQL-storage - rules: - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance - {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
- expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", - "vshn-postgresql-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume - claimed by the instance {{ $labels.name }} in - namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is expected to fill up within four days. Currently - {{ $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", - "vshn-postgresql-(.+)-.+") - for: 1h - labels: - severity: warning - - alert: PostgreSQLMemoryCritical + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance {{ $labels.name + }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} + == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume claimed + by the instance {{ $labels.name }} in namespace {{ + $labels.label_appcat_vshn_io_claim_namespace }} is + expected to fill up within four days. Currently {{ + $value | humanizePercentage }} is available. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: PostgreSQLMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: postgresql-memory + - name: postgresql-connections + rules: + - alert: PostgreSQLConnectionsCritical annotations: description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. - Please reducde the load of this instance, or increase the memory. - summary: Memory usage critical - expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / - on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. + Please reduce the load of this instance. + summary: Connection usage critical + expr: label_replace( topk(1, sum(pg_stat_activity_count) + by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) + by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") for: 120m labels: - severity: warning + severity: critical + syn_team: schedar + - name: postgresql-replication + rules: + - alert: PostgreSQLReplicationCritical + annotations: + description: The number of replicas for the instance + {{ $labels.cluster_name }} in namespace {{ $labels.namespace + }}. Please check pod counts in affected namespace. 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical + summary: Replication status check + expr: pg_replication_slots_active == 0 + for: 10m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-lag + rules: + - alert: PostgreSQLReplicationLagCritical + annotations: + description: Replication lag size on namespace {{$labels.exported_namespace}} + instance ({{$labels.application_name}}) is currently + {{ $value | humanize1024}}B behind the leader. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical + summary: Replication lag status check + expr: pg_replication_status_lag_size > 1e+09 + for: 5m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-count + rules: + - alert: PostgreSQLPodReplicasCritical + annotations: + description: Replication is broken in namespace {{$labels.namespace}}, + check statefulset ({{$labels.statefulset}}). + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical + summary: Replication lag status check + expr: kube_statefulset_status_replicas_available{statefulset=~".+", + namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} + for: 5m + labels: + severity: critical syn_team: schedar - name: postgresql-memory - - name: postgresql-connections - rules: - - alert: PostgreSQLConnectionsCritical - annotations: - description: |- - The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. - Please reduce the load of this instance. - summary: Connection usage critical - expr: label_replace( topk(1, sum(pg_stat_activity_count) - by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) - by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", - "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication - rules: - - alert: PostgreSQLReplicationCritical - annotations: - description: The number of replicas for the instance - {{ $labels.cluster_name }} in namespace {{ $labels.namespace - }}. Please check pod counts in affected namespace. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical - summary: Replication status check - expr: pg_replication_slots_active == 0 - for: 10m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-lag - rules: - - alert: PostgreSQLReplicationLagCritical - annotations: - description: Replication lag size on namespace {{$labels.exported_namespace}} - instance ({{$labels.application_name}}) is currently - {{ $value | humanize1024}}B behind the leader. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical - summary: Replication lag status check - expr: pg_replication_status_lag_size > 1e+09 - for: 5m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-count - rules: - - alert: PostgreSQLPodReplicasCritical - annotations: - description: Replication is broken in namespace - {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}). 
- runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical - summary: Replication lag status check - expr: kube_statefulset_status_replicas_available{statefulset=~".+", - namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} - for: 5m - labels: - severity: critical - syn_team: schedar providerConfigRef: name: kubernetes name: prometheusrule diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 63f3afb53..2db5f2c12 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1056,137 +1056,130 @@ spec: spec: forProvider: manifest: - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule - spec: - forProvider: - manifest: - metadata: - name: PostgreSQL-rules - spec: - groups: - - name: PostgreSQL-general-alerts + metadata: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + name: PostgreSQL-rules + spec: + groups: + - name: PostgreSQL-general-alerts + rules: + - name: PostgreSQL-storage + rules: + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance {{ $labels.name + }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} + == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume claimed + by the instance {{ $labels.name }} in namespace {{ + $labels.label_appcat_vshn_io_claim_namespace }} is + expected to fill up within four days. Currently {{ + $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: PostgreSQLMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: postgresql-memory + - name: postgresql-connections rules: - - name: PostgreSQL-storage - rules: - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance - {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", - "vshn-postgresql-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume - claimed by the instance {{ $labels.name }} in - namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is expected to fill up within four days. Currently - {{ $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
- expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", - "vshn-postgresql-(.+)-.+") - for: 1h - labels: - severity: warning - - alert: PostgreSQLMemoryCritical + - alert: PostgreSQLConnectionsCritical annotations: description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. - Please reducde the load of this instance, or increase the memory. - summary: Memory usage critical - expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / - on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. + Please reduce the load of this instance. + summary: Connection usage critical + expr: label_replace( topk(1, sum(pg_stat_activity_count) + by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) + by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") for: 120m labels: - severity: warning + severity: critical + syn_team: schedar + - name: postgresql-replication + rules: + - alert: PostgreSQLReplicationCritical + annotations: + description: The number of replicas for the instance + {{ $labels.cluster_name }} in namespace {{ $labels.namespace + }}. Please check pod counts in affected namespace. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical + summary: Replication status check + expr: pg_replication_slots_active == 0 + for: 10m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-lag + rules: + - alert: PostgreSQLReplicationLagCritical + annotations: + description: Replication lag size on namespace {{$labels.exported_namespace}} + instance ({{$labels.application_name}}) is currently + {{ $value | humanize1024}}B behind the leader. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical + summary: Replication lag status check + expr: pg_replication_status_lag_size > 1e+09 + for: 5m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-count + rules: + - alert: PostgreSQLPodReplicasCritical + annotations: + description: Replication is broken in namespace {{$labels.namespace}}, + check statefulset ({{$labels.statefulset}}). 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical + summary: Replication lag status check + expr: kube_statefulset_status_replicas_available{statefulset=~".+", + namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} + for: 5m + labels: + severity: critical syn_team: schedar - name: postgresql-memory - - name: postgresql-connections - rules: - - alert: PostgreSQLConnectionsCritical - annotations: - description: |- - The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. - Please reduce the load of this instance. - summary: Connection usage critical - expr: label_replace( topk(1, sum(pg_stat_activity_count) - by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) - by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", - "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication - rules: - - alert: PostgreSQLReplicationCritical - annotations: - description: The number of replicas for the instance - {{ $labels.cluster_name }} in namespace {{ $labels.namespace - }}. Please check pod counts in affected namespace. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical - summary: Replication status check - expr: pg_replication_slots_active == 0 - for: 10m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-lag - rules: - - alert: PostgreSQLReplicationLagCritical - annotations: - description: Replication lag size on namespace {{$labels.exported_namespace}} - instance ({{$labels.application_name}}) is currently - {{ $value | humanize1024}}B behind the leader. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical - summary: Replication lag status check - expr: pg_replication_status_lag_size > 1e+09 - for: 5m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-count - rules: - - alert: PostgreSQLPodReplicasCritical - annotations: - description: Replication is broken in namespace - {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}). 
- runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical - summary: Replication lag status check - expr: kube_statefulset_status_replicas_available{statefulset=~".+", - namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} - for: 5m - labels: - severity: critical - syn_team: schedar providerConfigRef: name: kubernetes name: prometheusrule diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index 8e7cac91b..0b0e639d3 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -91,78 +91,72 @@ spec: spec: forProvider: manifest: - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule - spec: - forProvider: - manifest: - metadata: - name: redis-rules - spec: - groups: - - name: redis-general-alerts + metadata: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + name: redis-rules + spec: + groups: + - name: redis-general-alerts + rules: + - name: redis-storage rules: - - name: redis-storage - rules: - - alert: redisPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance - {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", - "vshn-redis-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: redisPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume - claimed by the instance {{ $labels.name }} in - namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is expected to fill up within four days. Currently - {{ $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
- expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", - "vshn-redis-(.+)-.+") - for: 1h - labels: - severity: warning - - alert: redisMemoryCritical + - alert: redisPersistentVolumeFillingUp annotations: - description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. - Please reducde the load of this instance, or increase the memory. - summary: Memory usage critical - expr: label_replace( topk(1, (container_memory_working_set_bytes{container="redis"} / - on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + description: The volume claimed by the instance {{ $labels.name + }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} + == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") - for: 120m + for: 1m labels: - severity: warning + severity: critical syn_team: schedar - name: redis-memory + - alert: redisPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume claimed + by the instance {{ $labels.name }} in namespace {{ + $labels.label_appcat_vshn_io_claim_namespace }} is + expected to fill up within four days. Currently {{ + $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: redisMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="redis"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: redis-memory providerConfigRef: name: kubernetes name: prometheusrule From 092bbd08fbcdf91fda318773d9a994035a57d790 Mon Sep 17 00:00:00 2001 From: "lukasz.widera@vshn.ch" Date: Tue, 14 Nov 2023 12:58:47 +0100 Subject: [PATCH 03/11] solving duplicated rules issue --- component/component/common.libsonnet | 67 +++++++-------- .../appcat/21_composition_vshn_postgres.yaml | 85 +++++++++---------- .../21_composition_vshn_postgresrestore.yaml | 85 +++++++++---------- .../appcat/21_composition_vshn_redis.yaml | 85 +++++++++---------- 4 files changed, 158 insertions(+), 164 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index d63c76e0f..a42f73795 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -180,6 +180,12 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional // standardized lowercase regardless of what came as input local serviceNameLower = std.asciiLower(serviceName), local toReplace = 'vshn-replacemeplease', + local queries = { + availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', + availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, + usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', + unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', + }, name: 'prometheusrule', base: { @@ -204,44 +210,35 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional { name: '%s-general-alerts' % serviceName, rules: [ + + { + name: '%s-storage' % serviceName, + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'The volume claimed by the instance {{ 
$labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), + 'for': '1m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, { name: '%s-storage' % serviceName, - local queries = { - availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', - availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, - usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', - unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), + 'for': '1h', + labels: { + severity: 'warning', }, - rules: - [ - { - alert: serviceName + 'PersistentVolumeFillingUp', - annotations: { - description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), - 'for': '1m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - { - alert: serviceName + 'PersistentVolumeFillingUp', - annotations: { - description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. 
Currently {{ $value | humanizePercentage }} is available.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), - 'for': '1h', - labels: { - severity: 'warning', - }, - }, - ], }, { alert: serviceName + 'MemoryCritical', diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index f4bc1b0bf..ba4cff5f6 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -962,49 +962,48 @@ spec: groups: - name: PostgreSQL-general-alerts rules: - - name: PostgreSQL-storage - rules: - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance {{ $labels.name - }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} - == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume claimed - by the instance {{ $labels.name }} in namespace {{ - $labels.label_appcat_vshn_io_claim_namespace }} is - expected to fill up within four days. Currently {{ - $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
- expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1h - labels: - severity: warning + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance {{ $labels.name + }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} + == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + name: PostgreSQL-storage + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume claimed + by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 1h + labels: + severity: warning + name: PostgreSQL-storage - alert: PostgreSQLMemoryCritical annotations: description: |- diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 2db5f2c12..3f621ee18 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1064,49 +1064,48 @@ spec: groups: - name: PostgreSQL-general-alerts rules: - - name: PostgreSQL-storage - rules: - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance {{ $labels.name - }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} - == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume claimed - by the instance {{ $labels.name }} in namespace {{ - $labels.label_appcat_vshn_io_claim_namespace }} is - expected to fill up within four days. Currently {{ - $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
- expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1h - labels: - severity: warning + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance {{ $labels.name + }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} + == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + name: PostgreSQL-storage + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume claimed + by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 1h + labels: + severity: warning + name: PostgreSQL-storage - alert: PostgreSQLMemoryCritical annotations: description: |- diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index 0b0e639d3..04b11a46b 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -99,49 +99,48 @@ spec: groups: - name: redis-general-alerts rules: - - name: redis-storage - rules: - - alert: redisPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance {{ $labels.name - }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} - == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: redisPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume claimed - by the instance {{ $labels.name }} in namespace {{ - $labels.label_appcat_vshn_io_claim_namespace }} is - expected to fill up within four days. Currently {{ - $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
- expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, - persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") - for: 1h - labels: - severity: warning + - alert: redisPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance {{ $labels.name + }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} + == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + name: redis-storage + - alert: redisPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume claimed + by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 1h + labels: + severity: warning + name: redis-storage - alert: redisMemoryCritical annotations: description: |- From da0217006f6bbfad9b80ab717d27a401eda5ca98 Mon Sep 17 00:00:00 2001 From: "lukasz.widera@vshn.ch" Date: Tue, 14 Nov 2023 13:28:05 +0100 Subject: [PATCH 04/11] solving UpperCaseRules issue --- component/component/common.libsonnet | 13 ++++++------- .../appcat/appcat/21_composition_vshn_postgres.yaml | 7 +++---- .../appcat/21_composition_vshn_postgresrestore.yaml | 7 +++---- .../appcat/appcat/21_composition_vshn_redis.yaml | 1 - 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index a42f73795..f6d34b2c3 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -203,23 +203,23 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional metadata: { apiVersion: 'monitoring.coreos.com/v1', kind: 'PrometheusRule', - name: '%s-rules' % serviceName, + name: '%s-rules' % serviceNameLower, }, spec: { groups: [ { - name: '%s-general-alerts' % serviceName, + name: '%s-general-alerts' % serviceNameLower, rules: [ { - name: '%s-storage' % serviceName, + name: '%s-storage' % serviceNameLower, alert: serviceName + 'PersistentVolumeFillingUp', annotations: { description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', summary: 'PersistentVolume is filling up.', }, - expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower), 'for': '1m', labels: { severity: 'critical', @@ -227,14 +227,13 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional }, }, { - name: '%s-storage' % serviceName, alert: serviceName + 'PersistentVolumeFillingUp', annotations: { description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. 
Currently {{ $value | humanizePercentage }} is available.', runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', summary: 'PersistentVolume is filling up.', }, - expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)), + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower), 'for': '1h', labels: { severity: 'warning', @@ -248,7 +247,7 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional // runbook_url: 'TBD', summary: 'Memory usage critical', }, - expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90') % memoryContainerName, toReplace, 'vshn-' + std.asciiLower(serviceName)), + expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90') % memoryContainerName, toReplace, 'vshn-' + serviceNameLower), 'for': '120m', labels: { severity: 'warning', diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index ba4cff5f6..9b827e02d 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -957,10 +957,10 @@ spec: metadata: apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule - name: PostgreSQL-rules + name: postgresql-rules spec: groups: - - name: PostgreSQL-general-alerts + - name: postgresql-general-alerts rules: - alert: PostgreSQLPersistentVolumeFillingUp annotations: @@ -981,7 +981,7 @@ spec: labels: severity: critical syn_team: schedar - name: PostgreSQL-storage + name: postgresql-storage - alert: PostgreSQLPersistentVolumeFillingUp annotations: description: Based on recent sampling, the volume claimed @@ -1003,7 +1003,6 @@ spec: for: 1h labels: severity: warning - name: PostgreSQL-storage - alert: PostgreSQLMemoryCritical annotations: description: |- diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 3f621ee18..599856679 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1059,10 +1059,10 @@ spec: metadata: apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule - name: PostgreSQL-rules + name: postgresql-rules spec: groups: - - name: PostgreSQL-general-alerts + - name: postgresql-general-alerts rules: - alert: PostgreSQLPersistentVolumeFillingUp annotations: @@ -1083,7 +1083,7 @@ spec: labels: severity: critical syn_team: schedar - name: PostgreSQL-storage + name: postgresql-storage - alert: PostgreSQLPersistentVolumeFillingUp annotations: description: Based on recent sampling, the volume claimed @@ -1105,7 +1105,6 @@ spec: for: 1h labels: severity: warning - name: PostgreSQL-storage - alert: PostgreSQLMemoryCritical annotations: 
description: |- diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index 04b11a46b..dbad30b85 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -140,7 +140,6 @@ spec: for: 1h labels: severity: warning - name: redis-storage - alert: redisMemoryCritical annotations: description: |- From 512245740dcd0666687bb3ed44ed653934113dac Mon Sep 17 00:00:00 2001 From: "lukasz.widera@vshn.ch" Date: Tue, 14 Nov 2023 14:11:06 +0100 Subject: [PATCH 05/11] solving grouping of Alerts issue --- component/component/common.libsonnet | 5 +- .../appcat/21_composition_vshn_postgres.yaml | 116 +++++++++--------- .../21_composition_vshn_postgresrestore.yaml | 116 +++++++++--------- 3 files changed, 118 insertions(+), 119 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index f6d34b2c3..3454d299b 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -210,7 +210,6 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional { name: '%s-general-alerts' % serviceNameLower, rules: [ - { name: '%s-storage' % serviceNameLower, alert: serviceName + 'PersistentVolumeFillingUp', @@ -254,9 +253,9 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional syn_team: 'schedar', }, }, - ] + additionalAlerts, + ], }, - ], + ]+ additionalAlerts, }, }, }, diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index 9b827e02d..fd6683322 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -1018,64 +1018,64 @@ spec: severity: warning syn_team: schedar name: postgresql-memory - - name: postgresql-connections - rules: - - alert: PostgreSQLConnectionsCritical - annotations: - description: |- - The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. - Please reduce the load of this instance. - summary: Connection usage critical - expr: label_replace( topk(1, sum(pg_stat_activity_count) - by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) - by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication - rules: - - alert: PostgreSQLReplicationCritical - annotations: - description: The number of replicas for the instance - {{ $labels.cluster_name }} in namespace {{ $labels.namespace - }}. Please check pod counts in affected namespace. 
- runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical - summary: Replication status check - expr: pg_replication_slots_active == 0 - for: 10m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-lag - rules: - - alert: PostgreSQLReplicationLagCritical - annotations: - description: Replication lag size on namespace {{$labels.exported_namespace}} - instance ({{$labels.application_name}}) is currently - {{ $value | humanize1024}}B behind the leader. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical - summary: Replication lag status check - expr: pg_replication_status_lag_size > 1e+09 - for: 5m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-count - rules: - - alert: PostgreSQLPodReplicasCritical - annotations: - description: Replication is broken in namespace {{$labels.namespace}}, - check statefulset ({{$labels.statefulset}}). - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical - summary: Replication lag status check - expr: kube_statefulset_status_replicas_available{statefulset=~".+", - namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} - for: 5m - labels: - severity: critical - syn_team: schedar + - name: postgresql-connections + rules: + - alert: PostgreSQLConnectionsCritical + annotations: + description: |- + The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. + Please reduce the load of this instance. + summary: Connection usage critical + expr: label_replace( topk(1, sum(pg_stat_activity_count) by + (pod, namespace) > 90/100 * sum(pg_settings_max_connections) + by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication + rules: + - alert: PostgreSQLReplicationCritical + annotations: + description: The number of replicas for the instance {{ + $labels.cluster_name }} in namespace {{ $labels.namespace + }}. Please check pod counts in affected namespace. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical + summary: Replication status check + expr: pg_replication_slots_active == 0 + for: 10m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-lag + rules: + - alert: PostgreSQLReplicationLagCritical + annotations: + description: Replication lag size on namespace {{$labels.exported_namespace}} + instance ({{$labels.application_name}}) is currently {{ + $value | humanize1024}}B behind the leader. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical + summary: Replication lag status check + expr: pg_replication_status_lag_size > 1e+09 + for: 5m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-count + rules: + - alert: PostgreSQLPodReplicasCritical + annotations: + description: Replication is broken in namespace {{$labels.namespace}}, + check statefulset ({{$labels.statefulset}}). 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical + summary: Replication lag status check + expr: kube_statefulset_status_replicas_available{statefulset=~".+", + namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} + for: 5m + labels: + severity: critical + syn_team: schedar providerConfigRef: name: kubernetes name: prometheusrule diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 599856679..5a18c1b04 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1120,64 +1120,64 @@ spec: severity: warning syn_team: schedar name: postgresql-memory - - name: postgresql-connections - rules: - - alert: PostgreSQLConnectionsCritical - annotations: - description: |- - The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. - Please reduce the load of this instance. - summary: Connection usage critical - expr: label_replace( topk(1, sum(pg_stat_activity_count) - by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) - by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication - rules: - - alert: PostgreSQLReplicationCritical - annotations: - description: The number of replicas for the instance - {{ $labels.cluster_name }} in namespace {{ $labels.namespace - }}. Please check pod counts in affected namespace. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical - summary: Replication status check - expr: pg_replication_slots_active == 0 - for: 10m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-lag - rules: - - alert: PostgreSQLReplicationLagCritical - annotations: - description: Replication lag size on namespace {{$labels.exported_namespace}} - instance ({{$labels.application_name}}) is currently - {{ $value | humanize1024}}B behind the leader. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical - summary: Replication lag status check - expr: pg_replication_status_lag_size > 1e+09 - for: 5m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-count - rules: - - alert: PostgreSQLPodReplicasCritical - annotations: - description: Replication is broken in namespace {{$labels.namespace}}, - check statefulset ({{$labels.statefulset}}). 
- runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical - summary: Replication lag status check - expr: kube_statefulset_status_replicas_available{statefulset=~".+", - namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} - for: 5m - labels: - severity: critical - syn_team: schedar + - name: postgresql-connections + rules: + - alert: PostgreSQLConnectionsCritical + annotations: + description: |- + The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. + Please reduce the load of this instance. + summary: Connection usage critical + expr: label_replace( topk(1, sum(pg_stat_activity_count) by + (pod, namespace) > 90/100 * sum(pg_settings_max_connections) + by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication + rules: + - alert: PostgreSQLReplicationCritical + annotations: + description: The number of replicas for the instance {{ + $labels.cluster_name }} in namespace {{ $labels.namespace + }}. Please check pod counts in affected namespace. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical + summary: Replication status check + expr: pg_replication_slots_active == 0 + for: 10m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-lag + rules: + - alert: PostgreSQLReplicationLagCritical + annotations: + description: Replication lag size on namespace {{$labels.exported_namespace}} + instance ({{$labels.application_name}}) is currently {{ + $value | humanize1024}}B behind the leader. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical + summary: Replication lag status check + expr: pg_replication_status_lag_size > 1e+09 + for: 5m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-count + rules: + - alert: PostgreSQLPodReplicasCritical + annotations: + description: Replication is broken in namespace {{$labels.namespace}}, + check statefulset ({{$labels.statefulset}}). 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical + summary: Replication lag status check + expr: kube_statefulset_status_replicas_available{statefulset=~".+", + namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} + for: 5m + labels: + severity: critical + syn_team: schedar providerConfigRef: name: kubernetes name: prometheusrule From 72d4f457abad31b22754ba7ccdbee61e68984575 Mon Sep 17 00:00:00 2001 From: "lukasz.widera@vshn.ch" Date: Tue, 14 Nov 2023 14:25:09 +0100 Subject: [PATCH 06/11] renaming variable --- component/component/common.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index 3454d299b..bead909cd 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -176,7 +176,7 @@ local argoCDAnnotations() = { local bottomPod(query) = 'label_replace( bottomk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; local topPod(query) = 'label_replace( topk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; -local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts) = { +local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup) = { // standardized lowercase regardless of what came as input local serviceNameLower = std.asciiLower(serviceName), local toReplace = 'vshn-replacemeplease', @@ -255,7 +255,7 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional }, ], }, - ]+ additionalAlerts, + ]+ additionalAlertsRuleGroup, }, }, }, @@ -294,8 +294,8 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional removeField(obj, name), ArgoCDAnnotations(): argoCDAnnotations(), - GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts): - generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts), + GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup): + generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup), topPod(query): topPod(query), bottomPod(query): From 78b55055bf89dc04b8ae890c78819f9e7b124a2e Mon Sep 17 00:00:00 2001 From: "lukasz.widera@vshn.ch" Date: Tue, 14 Nov 2023 14:26:54 +0100 Subject: [PATCH 07/11] fixing linter --- component/component/common.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index bead909cd..9a6b40ae6 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -255,7 +255,7 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional }, ], }, - ]+ additionalAlertsRuleGroup, + ] + additionalAlertsRuleGroup, }, }, }, From 26a4794a843009c2bca0d242c0ec7ec4747aff9f Mon Sep 17 00:00:00 2001 From: "lukasz.widera@vshn.ch" Date: Tue, 14 Nov 2023 14:31:58 +0100 Subject: [PATCH 08/11] restoring seevrity --- component/component/common.libsonnet | 2 +- .../golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml | 2 +- .../vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml | 2 +- 
.../golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index 9a6b40ae6..e1b950c0a 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -249,7 +249,7 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90') % memoryContainerName, toReplace, 'vshn-' + serviceNameLower), 'for': '120m', labels: { - severity: 'warning', + severity: 'critical', syn_team: 'schedar', }, }, diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index fd6683322..1d04032f5 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -1015,7 +1015,7 @@ spec: kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") for: 120m labels: - severity: warning + severity: critical syn_team: schedar name: postgresql-memory - name: postgresql-connections diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 5a18c1b04..0ebad1b20 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1117,7 +1117,7 @@ spec: kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") for: 120m labels: - severity: warning + severity: critical syn_team: schedar name: postgresql-memory - name: postgresql-connections diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index dbad30b85..9dc04aa3c 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -152,7 +152,7 @@ spec: kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") for: 120m labels: - severity: warning + severity: critical syn_team: schedar name: redis-memory providerConfigRef: From 71f25ebd0a0b6604dbc5d48e8cebe72459f26806 Mon Sep 17 00:00:00 2001 From: Bigli <9610820+TheBigLee@users.noreply.github.com> Date: Fri, 17 Nov 2023 12:52:55 +0100 Subject: [PATCH 09/11] Update component/component/common.libsonnet --- component/component/common.libsonnet | 8 ++++---- .../vshn/appcat/appcat/21_composition_vshn_postgres.yaml | 8 ++++---- .../appcat/21_composition_vshn_postgresrestore.yaml | 8 ++++---- .../vshn/appcat/appcat/21_composition_vshn_redis.yaml | 8 ++++---- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index e1b950c0a..2e6be9510 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -200,9 +200,9 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional }, forProvider+: { manifest+: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', metadata: { - apiVersion: 
'monitoring.coreos.com/v1', - kind: 'PrometheusRule', name: '%s-rules' % serviceNameLower, }, spec: { @@ -242,11 +242,11 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional alert: serviceName + 'MemoryCritical', name: std.asciiLower(serviceName) + '-memory', annotations: { - description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', + description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', // runbook_url: 'TBD', summary: 'Memory usage critical', }, - expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90') % memoryContainerName, toReplace, 'vshn-' + serviceNameLower), + expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85') % memoryContainerName, toReplace, 'vshn-' + serviceNameLower), 'for': '120m', labels: { severity: 'critical', diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index 1d04032f5..cec83ca65 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -954,9 +954,9 @@ spec: spec: forProvider: manifest: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule metadata: - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule name: postgresql-rules spec: groups: @@ -1006,12 +1006,12 @@ spec: - alert: PostgreSQLMemoryCritical annotations: description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. Please reducde the load of this instance, or increase the memory. 
summary: Memory usage critical expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + * 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") for: 120m labels: diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 0ebad1b20..04e97c2e4 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1056,9 +1056,9 @@ spec: spec: forProvider: manifest: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule metadata: - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule name: postgresql-rules spec: groups: @@ -1108,12 +1108,12 @@ spec: - alert: PostgreSQLMemoryCritical annotations: description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. Please reducde the load of this instance, or increase the memory. summary: Memory usage critical expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + * 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") for: 120m labels: diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index 9dc04aa3c..e32e8dc3c 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -91,9 +91,9 @@ spec: spec: forProvider: manifest: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule metadata: - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule name: redis-rules spec: groups: @@ -143,12 +143,12 @@ spec: - alert: redisMemoryCritical annotations: description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. Please reducde the load of this instance, or increase the memory. 
summary: Memory usage critical expr: label_replace( topk(1, (container_memory_working_set_bytes{container="redis"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + * 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") for: 120m labels: From 5adf8b8ba72db46b828d3edb880f89a2ef01196d Mon Sep 17 00:00:00 2001 From: Nicolas Bigler Date: Fri, 17 Nov 2023 13:20:14 +0100 Subject: [PATCH 10/11] Fix broken prometheusRules, add missing Runbook for alert Signed-off-by: Nicolas Bigler --- component/component/common.libsonnet | 10 +++++++--- component/component/vshn_postgres.jsonnet | 3 +-- .../appcat/appcat/21_composition_vshn_postgres.yaml | 7 ++++--- .../appcat/21_composition_vshn_postgresrestore.yaml | 7 ++++--- .../vshn/appcat/appcat/21_composition_vshn_redis.yaml | 6 +++--- docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc | 9 ++++++++- 6 files changed, 27 insertions(+), 15 deletions(-) diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index 2e6be9510..68982eeee 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -208,10 +208,10 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional spec: { groups: [ { - name: '%s-general-alerts' % serviceNameLower, + name: '%s-storage' % serviceNameLower, rules: [ { - name: '%s-storage' % serviceNameLower, + alert: serviceName + 'PersistentVolumeFillingUp', annotations: { description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', @@ -238,9 +238,13 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional severity: 'warning', }, }, + ], + }, + { + name: std.asciiLower(serviceName) + '-memory', + rules: [ { alert: serviceName + 'MemoryCritical', - name: std.asciiLower(serviceName) + '-memory', annotations: { description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', // runbook_url: 'TBD', diff --git a/component/component/vshn_postgres.jsonnet b/component/component/vshn_postgres.jsonnet index e7b37d56d..bd0269dc8 100644 --- a/component/component/vshn_postgres.jsonnet +++ b/component/component/vshn_postgres.jsonnet @@ -774,7 +774,7 @@ local prometheusRule = common.GeneratePrometheusNonSLORules( alert: 'PostgreSQLConnectionsCritical', annotations: { description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.', - // runbook_url: 'TBD', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical', summary: 'Connection usage critical', }, @@ -787,7 +787,6 @@ local prometheusRule = common.GeneratePrometheusNonSLORules( }, ], }, - // new { name: 'postgresql-replication', rules: [ diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index cec83ca65..26f28b658 100644 --- 
a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -960,7 +960,7 @@ spec: name: postgresql-rules spec: groups: - - name: postgresql-general-alerts + - name: postgresql-storage rules: - alert: PostgreSQLPersistentVolumeFillingUp annotations: @@ -981,7 +981,6 @@ spec: labels: severity: critical syn_team: schedar - name: postgresql-storage - alert: PostgreSQLPersistentVolumeFillingUp annotations: description: Based on recent sampling, the volume claimed @@ -1003,6 +1002,8 @@ spec: for: 1h labels: severity: warning + - name: postgresql-memory + rules: - alert: PostgreSQLMemoryCritical annotations: description: |- @@ -1017,7 +1018,6 @@ spec: labels: severity: critical syn_team: schedar - name: postgresql-memory - name: postgresql-connections rules: - alert: PostgreSQLConnectionsCritical @@ -1025,6 +1025,7 @@ spec: description: |- The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. Please reduce the load of this instance. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical summary: Connection usage critical expr: label_replace( topk(1, sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 04e97c2e4..760fde789 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1062,7 +1062,7 @@ spec: name: postgresql-rules spec: groups: - - name: postgresql-general-alerts + - name: postgresql-storage rules: - alert: PostgreSQLPersistentVolumeFillingUp annotations: @@ -1083,7 +1083,6 @@ spec: labels: severity: critical syn_team: schedar - name: postgresql-storage - alert: PostgreSQLPersistentVolumeFillingUp annotations: description: Based on recent sampling, the volume claimed @@ -1105,6 +1104,8 @@ spec: for: 1h labels: severity: warning + - name: postgresql-memory + rules: - alert: PostgreSQLMemoryCritical annotations: description: |- @@ -1119,7 +1120,6 @@ spec: labels: severity: critical syn_team: schedar - name: postgresql-memory - name: postgresql-connections rules: - alert: PostgreSQLConnectionsCritical @@ -1127,6 +1127,7 @@ spec: description: |- The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. Please reduce the load of this instance. 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical summary: Connection usage critical expr: label_replace( topk(1, sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index e32e8dc3c..61d0d970b 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -97,7 +97,7 @@ spec: name: redis-rules spec: groups: - - name: redis-general-alerts + - name: redis-storage rules: - alert: redisPersistentVolumeFillingUp annotations: @@ -118,7 +118,6 @@ spec: labels: severity: critical syn_team: schedar - name: redis-storage - alert: redisPersistentVolumeFillingUp annotations: description: Based on recent sampling, the volume claimed @@ -140,6 +139,8 @@ spec: for: 1h labels: severity: warning + - name: redis-memory + rules: - alert: redisMemoryCritical annotations: description: |- @@ -154,7 +155,6 @@ spec: labels: severity: critical syn_team: schedar - name: redis-memory providerConfigRef: name: kubernetes name: prometheusrule diff --git a/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc b/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc index b9d9ed15a..147cc6812 100644 --- a/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc +++ b/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc @@ -167,8 +167,15 @@ This alert fires when there are issues with statefullset responsible for replica ``` kubectl describe -n vshn-postgresql- sts -## for exmaple: kubectl -n vshn-postgresql-test-cluster-always-true-jnlj4 describe sts test-cluster-always-true-jnlj4 +## for example: kubectl -n vshn-postgresql-test-cluster-always-true-jnlj4 describe sts test-cluster-always-true-jnlj4 ## get events from affected namespace and look for issues k -n vshn-postgresql-test-cluster-always-true-jnlj4 get events ``` + +[[PostgreSQLConnectionsCritical]] +== PostgreSQLConnectionsCritical + +This alert fires when the used connection is over 90% of the configured `max_connections` limit (defaults to 100). +It means that either the connection limit is set too low or an application is misbehaving and spawning too many connections. +You either need to raise the `max_connections` parameter on the PostgreSQL instance or debug the application, as it might be misbehaving and spawning too many connections. 
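The PostgreSQLConnectionsCritical runbook added above suggests either raising `max_connections` or debugging the client application. A minimal sketch of how to verify the condition on an affected instance might look as follows; the instance namespace, pod name, container name and database user are placeholders or assumptions and are not taken from these patches:

```
# Sketch only: check connection usage on the instance the alert points at.
# <instance> and <cluster-pod> are placeholders; the "patroni" container and
# the "postgres" user are assumptions and may differ in a real deployment.
kubectl -n vshn-postgresql-<instance> exec -it <cluster-pod> -c patroni -- \
  psql -U postgres -c "SHOW max_connections;"
kubectl -n vshn-postgresql-<instance> exec -it <cluster-pod> -c patroni -- \
  psql -U postgres -c "SELECT count(*) FROM pg_stat_activity;"
```

If the live connection count stays near the configured limit for hours, either the limit is too low for the workload or a client is leaking connections, which mirrors the two remediation paths the runbook describes.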
From 11fcebe9a7bd5656216a540a315587348185335f Mon Sep 17 00:00:00 2001 From: Nicolas Bigler Date: Fri, 17 Nov 2023 13:56:57 +0100 Subject: [PATCH 11/11] Refactor prometheus function into separate file, add missing runbook Signed-off-by: Nicolas Bigler --- component/component/common.libsonnet | 127 ---------------- component/component/prometheus.libsonnet | 136 ++++++++++++++++++ component/component/vshn_minio.jsonnet | 6 +- component/component/vshn_postgres.jsonnet | 11 +- component/component/vshn_redis.jsonnet | 8 +- .../appcat/21_composition_vshn_postgres.yaml | 1 + .../21_composition_vshn_postgresrestore.yaml | 1 + .../appcat/21_composition_vshn_redis.yaml | 1 + .../ROOT/pages/runbooks/vshn-generic.adoc | 9 ++ 9 files changed, 160 insertions(+), 140 deletions(-) create mode 100644 component/component/prometheus.libsonnet create mode 100644 docs/modules/ROOT/pages/runbooks/vshn-generic.adoc diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index 68982eeee..ca71d9e35 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -134,31 +134,6 @@ local getAppCatImageString() = params.images.appcat.registry + '/' + params.imag local getApiserverImageString() = params.images.apiserver.registry + '/' + params.images.apiserver.repository + ':' + getApiserverImageTag(); -local promRuleSLA(value, service) = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', 'vshn-' + std.asciiLower(service) + '-sla') { - metadata+: { - labels: { - name: 'vshn-' + std.asciiLower(service) + '-sla', - }, - namespace: params.slos.namespace, - }, - spec: { - groups: [ - { - name: 'appcat-' + std.asciiLower(service) + '-sla-target', - rules: [ - { - expr: 'vector(' + value + ')', - labels: { - service: service, - }, - record: 'sla:objective:ratio', - }, - ], - }, - ], - }, -}; - local removeField(obj, name) = { // We don't want the name field in the actual providerConfig [k]: obj[k] @@ -173,100 +148,6 @@ local argoCDAnnotations() = { 'argocd.argoproj.io/sync-options': 'Prune=false', }; -local bottomPod(query) = 'label_replace( bottomk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; -local topPod(query) = 'label_replace( topk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; - -local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup) = { - // standardized lowercase regardless of what came as input - local serviceNameLower = std.asciiLower(serviceName), - local toReplace = 'vshn-replacemeplease', - local queries = { - availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', - availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, - usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', - unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', - }, - name: 'prometheusrule', - base: { - - apiVersion: 'kubernetes.crossplane.io/v1alpha1', - kind: 'Object', - metadata: { - name: 'prometheusrule', - }, - spec: { - 
providerConfigRef: { - name: 'kubernetes', - }, - forProvider+: { - manifest+: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'PrometheusRule', - metadata: { - name: '%s-rules' % serviceNameLower, - }, - spec: { - groups: [ - { - name: '%s-storage' % serviceNameLower, - rules: [ - { - - alert: serviceName + 'PersistentVolumeFillingUp', - annotations: { - description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower), - 'for': '1m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - { - alert: serviceName + 'PersistentVolumeFillingUp', - annotations: { - description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower), - 'for': '1h', - labels: { - severity: 'warning', - }, - }, - ], - }, - { - name: std.asciiLower(serviceName) + '-memory', - rules: [ - { - alert: serviceName + 'MemoryCritical', - annotations: { - description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', - // runbook_url: 'TBD', - summary: 'Memory usage critical', - }, - expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85') % memoryContainerName, toReplace, 'vshn-' + serviceNameLower), - 'for': '120m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - ] + additionalAlertsRuleGroup, - }, - }, - }, - }, - }, -}; - { SyncOptions: syncOptions, @@ -292,16 +173,8 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional getApiserverImageTag(), GetApiserverImageString(): getApiserverImageString(), - PromRuleSLA(value, service): - promRuleSLA(value, service), RemoveField(obj, name): removeField(obj, name), ArgoCDAnnotations(): argoCDAnnotations(), - GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup): - generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup), - topPod(query): - topPod(query), - bottomPod(query): - bottomPod(query), } diff --git a/component/component/prometheus.libsonnet b/component/component/prometheus.libsonnet new file mode 100644 index 000000000..13f08899b --- /dev/null +++ b/component/component/prometheus.libsonnet @@ -0,0 +1,136 @@ +local kap = import 'lib/kapitan.libjsonnet'; +local kube = import 'lib/kube.libjsonnet'; + +local inv = kap.inventory(); +local params = 
inv.parameters.appcat; + + +local promRuleSLA(value, service) = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', 'vshn-' + std.asciiLower(service) + '-sla') { + metadata+: { + labels: { + name: 'vshn-' + std.asciiLower(service) + '-sla', + }, + namespace: params.slos.namespace, + }, + spec: { + groups: [ + { + name: 'appcat-' + std.asciiLower(service) + '-sla-target', + rules: [ + { + expr: 'vector(' + value + ')', + labels: { + service: service, + }, + record: 'sla:objective:ratio', + }, + ], + }, + ], + }, +}; + +local bottomPod(query) = 'label_replace( bottomk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; +local topPod(query) = 'label_replace( topk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; + +local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup) = { + // standardized lowercase regardless of what came as input + local serviceNameLower = std.asciiLower(serviceName), + local toReplace = 'vshn-replacemeplease', + local queries = { + availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', + availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, + usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', + unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', + }, + name: 'prometheusrule', + base: { + + apiVersion: 'kubernetes.crossplane.io/v1alpha1', + kind: 'Object', + metadata: { + name: 'prometheusrule', + }, + spec: { + providerConfigRef: { + name: 'kubernetes', + }, + forProvider+: { + manifest+: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: '%s-rules' % serviceNameLower, + }, + spec: { + groups: [ + { + name: '%s-storage' % serviceNameLower, + rules: [ + { + + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower), + 'for': '1m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + { + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. 
Currently {{ $value | humanizePercentage }} is available.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower), + 'for': '1h', + labels: { + severity: 'warning', + }, + }, + ], + }, + { + name: std.asciiLower(serviceName) + '-memory', + rules: [ + { + alert: serviceName + 'MemoryCritical', + annotations: { + description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical', + summary: 'Memory usage critical', + }, + expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85') % memoryContainerName, toReplace, 'vshn-' + serviceNameLower), + 'for': '120m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + ], + }, + ] + additionalAlertsRuleGroup, + }, + }, + }, + }, + }, +}; + +{ + GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup): + generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup), + PromRuleSLA(value, service): + promRuleSLA(value, service), + TopPod(query): + topPod(query), + BottomPod(query): + bottomPod(query), +} diff --git a/component/component/vshn_minio.jsonnet b/component/component/vshn_minio.jsonnet index 6649c40be..b3b94f3ef 100644 --- a/component/component/vshn_minio.jsonnet +++ b/component/component/vshn_minio.jsonnet @@ -6,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet'; local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; -local xrds = import 'xrds.libsonnet'; - +local prom = import 'prometheus.libsonnet'; local slos = import 'slos.libsonnet'; +local xrds = import 'xrds.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.appcat; @@ -23,7 +23,7 @@ local connectionSecretKeys = [ 'AWS_ACCESS_KEY_ID', ]; -local promRuleMinioSLA = common.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio'); +local promRuleMinioSLA = prom.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio'); local minioPlans = common.FilterDisabledParams(minioParams.plans); diff --git a/component/component/vshn_postgres.jsonnet b/component/component/vshn_postgres.jsonnet index bd0269dc8..53f4c35f2 100644 --- a/component/component/vshn_postgres.jsonnet +++ b/component/component/vshn_postgres.jsonnet @@ -1,4 +1,3 @@ -local common = import 'common.libsonnet'; local com = import 'lib/commodore.libjsonnet'; local kap = import 'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; @@ -7,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet'; local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; -local xrds = import 'xrds.libsonnet'; - +local prom = import 'prometheus.libsonnet'; local slos = import 'slos.libsonnet'; +local xrds = import 'xrds.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.appcat; @@ -50,7 +49,7 @@ local xrd = xrds.XRDFromCRD( 
connectionSecretKeys=connectionSecretKeys, ) + xrds.WithPlanDefaults(pgPlans, pgParams.defaultPlan); -local promRulePostgresSLA = common.PromRuleSLA(params.services.vshn.postgres.sla, 'VSHNPostgreSQL'); +local promRulePostgresSLA = prom.PromRuleSLA(params.services.vshn.postgres.sla, 'VSHNPostgreSQL'); local restoreServiceAccount = kube.ServiceAccount('copyserviceaccount') + { metadata+: { @@ -763,7 +762,7 @@ local clusterRestoreConfig = { }; -local prometheusRule = common.GeneratePrometheusNonSLORules( +local prometheusRule = prom.GeneratePrometheusNonSLORules( 'PostgreSQL', 'patroni', [ @@ -778,7 +777,7 @@ local prometheusRule = common.GeneratePrometheusNonSLORules( summary: 'Connection usage critical', }, - expr: std.strReplace(common.topPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), 'vshn-replacemeplease', 'vshn-' + std.asciiLower('PostgreSQL')), + expr: std.strReplace(prom.TopPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), 'vshn-replacemeplease', 'vshn-' + std.asciiLower('PostgreSQL')), 'for': '120m', labels: { severity: 'critical', diff --git a/component/component/vshn_redis.jsonnet b/component/component/vshn_redis.jsonnet index ae86753cd..1d2a30cc3 100644 --- a/component/component/vshn_redis.jsonnet +++ b/component/component/vshn_redis.jsonnet @@ -6,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet'; local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; -local xrds = import 'xrds.libsonnet'; - +local prom = import 'prometheus.libsonnet'; local slos = import 'slos.libsonnet'; +local xrds = import 'xrds.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.appcat; @@ -47,7 +47,7 @@ local xrd = xrds.XRDFromCRD( connectionSecretKeys=connectionSecretKeys, ) + xrds.WithPlanDefaults(redisPlans, redisParams.defaultPlan); -local promRuleRedisSLA = common.PromRuleSLA(params.services.vshn.redis.sla, 'VSHNRedis'); +local promRuleRedisSLA = prom.PromRuleSLA(params.services.vshn.redis.sla, 'VSHNRedis'); local restoreServiceAccount = kube.ServiceAccount('redisrestoreserviceaccount') + { metadata+: { @@ -386,7 +386,7 @@ local composition = }, }; - local prometheusRule = common.GeneratePrometheusNonSLORules('redis', 'redis', []) + { + local prometheusRule = prom.GeneratePrometheusNonSLORules('redis', 'redis', []) + { patches: [ comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'), comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-redis'), diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index 26f28b658..12878be2f 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -1009,6 +1009,7 @@ spec: description: |- The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. Please reducde the load of this instance, or increase the memory. 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical summary: Memory usage critical expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 760fde789..eb74bafcf 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1111,6 +1111,7 @@ spec: description: |- The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. Please reducde the load of this instance, or increase the memory. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical summary: Memory usage critical expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index 61d0d970b..2863eade0 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -146,6 +146,7 @@ spec: description: |- The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. Please reducde the load of this instance, or increase the memory. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical summary: Memory usage critical expr: label_replace( topk(1, (container_memory_working_set_bytes{container="redis"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} diff --git a/docs/modules/ROOT/pages/runbooks/vshn-generic.adoc b/docs/modules/ROOT/pages/runbooks/vshn-generic.adoc new file mode 100644 index 000000000..8c4429718 --- /dev/null +++ b/docs/modules/ROOT/pages/runbooks/vshn-generic.adoc @@ -0,0 +1,9 @@ += Generic alerts + +[[memoryCritical]] +== MemoryCritical + +The instance uses more than 85% of the memory limit set on the pod. +A further increase in memory usage might lead to the pod being OOM-killed by Kubernetes. + +Either adjust the limits of the affected instance or reduce the workload on the instance to lower memory consumption (this depends highly on the used service).
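The generic MemoryCritical runbook above leaves the concrete investigation steps to the reader. A possible first check, sketched here with placeholder names (the namespace pattern and pod name are assumptions, and `kubectl top` requires a working metrics API), is to compare actual usage against the configured limits of the instance pods:

```
# Sketch: confirm which pod of the affected instance is close to its memory limit.
# <service>, <instance> and <pod> are placeholders for the service name,
# claim namespace and pod of the affected instance.
kubectl -n vshn-<service>-<instance> top pods
kubectl -n vshn-<service>-<instance> describe pod <pod> | grep -A 2 'Limits:'
```

Whether to raise the memory limit or to reduce the workload on the instance then depends on the service in question, as the runbook notes.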