From 70b68d290eeea061907296728e186bcb57f165bb Mon Sep 17 00:00:00 2001
From: "lukasz.widera@vshn.ch" <lukasz.widera@vshn.ch>
Date: Fri, 3 Nov 2023 14:20:18 +0100
Subject: [PATCH] Add memory and storage alerts for Redis

---
 component/component/common.libsonnet         | 108 ++++++++
 component/component/vshn_postgres.jsonnet    | 230 ++++++----------
 component/component/vshn_redis.jsonnet       |  14 +-
 .../appcat/21_composition_vshn_postgres.yaml | 252 +++++++++---------
 .../21_composition_vshn_postgresrestore.yaml | 252 +++++++++---------
 .../appcat/21_composition_vshn_redis.yaml    | 102 +++++++
 6 files changed, 566 insertions(+), 392 deletions(-)

diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet
index 718d8a886..58ecbb229 100644
--- a/component/component/common.libsonnet
+++ b/component/component/common.libsonnet
@@ -173,6 +173,108 @@ local argoCDAnnotations() = {
   'argocd.argoproj.io/sync-options': 'Prune=false',
 };
 
+local bottomPod(query) = 'label_replace( bottomk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query;
+local topPod(query) = 'label_replace( topk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query;
+
+local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts) = {
+  // standardize to lowercase regardless of what came as input
+  local serviceNameLower = std.asciiLower(serviceName),
+  local toReplace = 'vshn-replacemeplease',
+  name: 'prometheusrule',
+  base: {
+
+    apiVersion: 'kubernetes.crossplane.io/v1alpha1',
+    kind: 'Object',
+    metadata: {
+      name: 'prometheusrule',
+    },
+    spec: {
+      providerConfigRef: {
+        name: 'kubernetes',
+      },
+      forProvider: {
+        manifest: {
+          apiVersion: 'monitoring.coreos.com/v1',
+          kind: 'PrometheusRule',
+        },
+        spec+: {
+          forProvider+: {
+            manifest+: {
+              metadata: {
+                name: '%s-rules' % serviceName,
+              },
+              spec: {
+                groups: [
+                  {
+                    name: '%s-general-alerts' % serviceName,
+                    rules: [
+                      {
+                        name: '%s-storage' % serviceName,
+                        local queries = {
+                          availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}',
+                          availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage,
+                          usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}',
+                          unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1',
+                        },
+                        rules:
+                          [
+                            {
+                              alert: serviceName + 'PersistentVolumeFillingUp',
+                              annotations: {
+                                description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.',
+                                runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
+                                summary: 'PersistentVolume is filling up.',
+                              },
+                              expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)),
+                              'for': '1m',
+                              labels: {
+                                severity: 'critical',
+                                syn_team: 'schedar',
+                              },
+                            },
+                            {
+                              alert: serviceName + 'PersistentVolumeFillingUp',
+                              annotations: {
+                                description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
+                                runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
+                                summary: 'PersistentVolume is filling up.',
+                              },
+                              expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + std.asciiLower(serviceName)),
+                              'for': '1h',
+                              labels: {
+                                severity: 'warning',
+                              },
+                            },
+                          ],
+                      },
+                      {
+                        alert: serviceName + 'MemoryCritical',
+                        name: std.asciiLower(serviceName) + '-memory',
+                        annotations: {
+                          description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours.\n Please reduce the load of this instance, or increase the memory.',
+                          // runbook_url: 'TBD',
+                          summary: 'Memory usage critical',
+                        },
+                        expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90') % memoryContainerName, toReplace, 'vshn-' + std.asciiLower(serviceName)),
+                        'for': '120m',
+                        labels: {
+                          severity: 'warning',
+                          syn_team: 'schedar',
+                        },
+                      },
+                    ] + additionalAlerts,
+                  },
+                ],
+              },
+            },
+          },
+        },
+      },
+    },
+  },
+};
+
+
 {
   SyncOptions: syncOptions,
   VshnMetaDBaaSExoscale(dbname):
@@ -203,4 +305,10 @@ local argoCDAnnotations() = {
     removeField(obj, name),
   ArgoCDAnnotations(): argoCDAnnotations(),
+  GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts):
+    generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts),
+  topPod(query):
+    topPod(query),
+  bottomPod(query):
+    bottomPod(query),
 }
diff --git a/component/component/vshn_postgres.jsonnet b/component/component/vshn_postgres.jsonnet
index a1fc80a9e..e7b37d56d 100644
--- a/component/component/vshn_postgres.jsonnet
+++ b/component/component/vshn_postgres.jsonnet
@@ -762,159 +762,91 @@ local clusterRestoreConfig = {
   ],
 };
 
-local prometheusRule = {
-  name: 'prometheusrule',
-  base: comp.KubeObject('monitoring.coreos.com/v1', 'PrometheusRule') + {
-    spec+: {
-      forProvider+: {
-        manifest+: {
-          metadata: {
-            name: 'postgresql-rules',
+
+local prometheusRule = common.GeneratePrometheusNonSLORules(
+  'PostgreSQL',
+  'patroni',
+  [
+    {
+      name: 'postgresql-connections',
+      rules: [
+        {
+          alert: 'PostgreSQLConnectionsCritical',
+          annotations: {
+            description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.',
+            // runbook_url: 'TBD',
+            summary: 'Connection usage critical',
           },
-          local bottomPod(query) = 'label_replace( bottomk(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+")' % query,
-          local topPod(query) = 'label_replace( topk(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+")' % query,
-          spec: {
-            groups: [
-              {
-                name: 'postgresql-storage',
-                local queries = {
-                  availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet",
metrics_path="/metrics"}', - availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, - usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', - unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', - }, - rules: [ - { - alert: 'PostgreSQLPersistentVolumeFillingUp', - annotations: { - description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), - 'for': '1m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - { - alert: 'PostgreSQLPersistentVolumeFillingUp', - annotations: { - description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), - 'for': '1h', - labels: { - severity: 'warning', - }, - }, - ], - }, - { - name: 'postgresql-memory', - rules: [ - { - alert: 'PostgreSQLMemoryCritical', - annotations: { - description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', - // runbook_url: 'TBD', - summary: 'Memory usage critical', - }, - expr: topPod('(container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85'), - 'for': '120m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-connections', - rules: [ - { - alert: 'PostgreSQLConnectionsCritical', - annotations: { - description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.', - // runbook_url: 'TBD', - summary: 'Connection usage critical', - }, - expr: topPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), - 'for': '120m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - // new - { - name: 'postgresql-replication', - rules: [ - { - alert: 'PostgreSQLReplicationCritical', - annotations: { - description: 'The number of replicas for the instance {{ $labels.cluster_name }} in namespace {{ $labels.namespace }}. 
Please check pod counts in affected namespace.', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical', - summary: 'Replication status check', - }, - expr: 'pg_replication_slots_active == 0', - 'for': '10m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-replication-lag', - rules: [ - { - alert: 'PostgreSQLReplicationLagCritical', - annotations: { - description: 'Replication lag size on namespace {{$labels.exported_namespace}} instance ({{$labels.application_name}}) is currently {{ $value | humanize1024}}B behind the leader.', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical', - summary: 'Replication lag status check', - }, - expr: 'pg_replication_status_lag_size > 1e+09', - 'for': '5m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-replication-count', - rules: [ - { - alert: 'PostgreSQLPodReplicasCritical', - annotations: { - description: 'Replication is broken in namespace {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}).', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical', - summary: 'Replication lag status check', - }, - expr: 'kube_statefulset_status_replicas_available{statefulset=~".+", namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"}', - 'for': '5m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - ], + + expr: std.strReplace(common.topPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), 'vshn-replacemeplease', 'vshn-' + std.asciiLower('PostgreSQL')), + 'for': '120m', + labels: { + severity: 'critical', + syn_team: 'schedar', }, }, - }, + ], }, - }, + // new + { + name: 'postgresql-replication', + rules: [ + { + alert: 'PostgreSQLReplicationCritical', + annotations: { + description: 'The number of replicas for the instance {{ $labels.cluster_name }} in namespace {{ $labels.namespace }}. 
Please check pod counts in affected namespace.',
+          runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical',
+          summary: 'Replication status check',
+        },
+        expr: 'pg_replication_slots_active == 0',
+        'for': '10m',
+        labels: {
+          severity: 'critical',
+          syn_team: 'schedar',
+        },
+      },
+    ],
+  },
+  {
+    name: 'postgresql-replication-lag',
+    rules: [
+      {
+        alert: 'PostgreSQLReplicationLagCritical',
+        annotations: {
+          description: 'Replication lag size on namespace {{$labels.exported_namespace}} instance ({{$labels.application_name}}) is currently {{ $value | humanize1024}}B behind the leader.',
+          runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical',
+          summary: 'Replication lag status check',
+        },
+        expr: 'pg_replication_status_lag_size > 1e+09',
+        'for': '5m',
+        labels: {
+          severity: 'critical',
+          syn_team: 'schedar',
+        },
+      },
+    ],
+  },
+  {
+    name: 'postgresql-replication-count',
+    rules: [
+      {
+        alert: 'PostgreSQLPodReplicasCritical',
+        annotations: {
+          description: 'Replication is broken in namespace {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}).',
+          runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical',
+          summary: 'Replication lag status check',
+        },
+        expr: 'kube_statefulset_status_replicas_available{statefulset=~".+", namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"}',
+        'for': '5m',
+        labels: {
+          severity: 'critical',
+          syn_team: 'schedar',
+        },
+      },
+    ],
+  },
+  ]
+) + {
   patches: [
     comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'),
     comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-postgresql'),
diff --git a/component/component/vshn_redis.jsonnet b/component/component/vshn_redis.jsonnet
index c48f28989..074ef3b66 100644
--- a/component/component/vshn_redis.jsonnet
+++ b/component/component/vshn_redis.jsonnet
@@ -386,6 +386,14 @@ local composition =
     },
   };
 
+  local prometheusRule = common.GeneratePrometheusNonSLORules('redis', 'redis', []) + {
+    patches: [
+      comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'),
+      comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-redis'),
+    ],
+  };
+
+
 local redisHelmChart = {
   apiVersion: 'helm.crossplane.io/v1beta1',
@@ -407,7 +415,10 @@ local composition =
             {
               name: 'REDIS_EXPORTER_SKIP_TLS_VERIFICATION',
               value: 'true',
-
+            },
+            {
+              name: 'REDIS_EXPORTER_INCL_SYSTEM_METRICS',
+              value: 'true',
             },
           ],
           containerSecurityContext: {
@@ -537,6 +548,7 @@ local composition =
           comp.ToCompositeFieldPath('status.atProvider.manifest.metadata.labels[appuio.io/organization]', 'metadata.labels[appuio.io/organization]'),
         ],
       },
+      prometheusRule,
       {
         name: 'namespace-conditions',
         base: namespace,
diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml
index ca4805619..6cfde23af 100644
--- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml
+++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml
@@ -949,132 +949,142 @@ spec:
     - base:
         apiVersion:
kubernetes.crossplane.io/v1alpha1 kind: Object - metadata: {} + metadata: + name: prometheusrule spec: forProvider: manifest: apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule - metadata: - name: postgresql-rules - spec: - groups: - - name: postgresql-storage - rules: - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance {{ $labels.name - }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} - == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume claimed - by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is expected to fill up within four days. Currently - {{ $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1h - labels: - severity: warning - - name: postgresql-memory - rules: - - alert: PostgreSQLMemoryCritical - annotations: - description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. - Please reducde the load of this instance, or increase the memory. 
- summary: Memory usage critical - expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / - on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-connections - rules: - - alert: PostgreSQLConnectionsCritical - annotations: - description: |- - The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. - Please reduce the load of this instance. - summary: Connection usage critical - expr: label_replace( topk(1, sum(pg_stat_activity_count) by - (pod, namespace) > 90/100 * sum(pg_settings_max_connections) - by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication - rules: - - alert: PostgreSQLReplicationCritical - annotations: - description: The number of replicas for the instance {{ - $labels.cluster_name }} in namespace {{ $labels.namespace - }}. Please check pod counts in affected namespace. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical - summary: Replication status check - expr: pg_replication_slots_active == 0 - for: 10m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-lag - rules: - - alert: PostgreSQLReplicationLagCritical - annotations: - description: Replication lag size on namespace {{$labels.exported_namespace}} - instance ({{$labels.application_name}}) is currently {{ - $value | humanize1024}}B behind the leader. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical - summary: Replication lag status check - expr: pg_replication_status_lag_size > 1e+09 - for: 5m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-count - rules: - - alert: PostgreSQLPodReplicasCritical - annotations: - description: Replication is broken in namespace {{$labels.namespace}}, - check statefulset ({{$labels.statefulset}}). - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical - summary: Replication lag status check - expr: kube_statefulset_status_replicas_available{statefulset=~".+", - namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} - for: 5m - labels: - severity: critical - syn_team: schedar + spec: + forProvider: + manifest: + metadata: + name: PostgreSQL-rules + spec: + groups: + - name: PostgreSQL-general-alerts + rules: + - name: PostgreSQL-storage + rules: + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance + {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume + claimed by the instance {{ $labels.name }} in + namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: PostgreSQLMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: postgresql-memory + - name: postgresql-connections + rules: + - alert: PostgreSQLConnectionsCritical + annotations: + description: |- + The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. + Please reduce the load of this instance. 
+ summary: Connection usage critical + expr: label_replace( topk(1, sum(pg_stat_activity_count) + by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) + by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication + rules: + - alert: PostgreSQLReplicationCritical + annotations: + description: The number of replicas for the instance + {{ $labels.cluster_name }} in namespace {{ $labels.namespace + }}. Please check pod counts in affected namespace. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical + summary: Replication status check + expr: pg_replication_slots_active == 0 + for: 10m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-lag + rules: + - alert: PostgreSQLReplicationLagCritical + annotations: + description: Replication lag size on namespace {{$labels.exported_namespace}} + instance ({{$labels.application_name}}) is currently + {{ $value | humanize1024}}B behind the leader. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical + summary: Replication lag status check + expr: pg_replication_status_lag_size > 1e+09 + for: 5m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-count + rules: + - alert: PostgreSQLPodReplicasCritical + annotations: + description: Replication is broken in namespace + {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}). + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical + summary: Replication lag status check + expr: kube_statefulset_status_replicas_available{statefulset=~".+", + namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} + for: 5m + labels: + severity: critical + syn_team: schedar providerConfigRef: name: kubernetes name: prometheusrule diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index b43d4e118..0a5995b6c 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1051,132 +1051,142 @@ spec: - base: apiVersion: kubernetes.crossplane.io/v1alpha1 kind: Object - metadata: {} + metadata: + name: prometheusrule spec: forProvider: manifest: apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule - metadata: - name: postgresql-rules - spec: - groups: - - name: postgresql-storage - rules: - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: The volume claimed by the instance {{ $labels.name - }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. 
- expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} - == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1m - labels: - severity: critical - syn_team: schedar - - alert: PostgreSQLPersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the volume claimed - by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace - }} is expected to fill up within four days. Currently - {{ $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", - metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", - metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", - metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless - on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ - access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} - == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 1h - labels: - severity: warning - - name: postgresql-memory - rules: - - alert: PostgreSQLMemoryCritical - annotations: - description: |- - The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. - Please reducde the load of this instance, or increase the memory. - summary: Memory usage critical - expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / - on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} - * 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-connections - rules: - - alert: PostgreSQLConnectionsCritical - annotations: - description: |- - The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. - Please reduce the load of this instance. 
- summary: Connection usage critical - expr: label_replace( topk(1, sum(pg_stat_activity_count) by - (pod, namespace) > 90/100 * sum(pg_settings_max_connections) - by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) - kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") - for: 120m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication - rules: - - alert: PostgreSQLReplicationCritical - annotations: - description: The number of replicas for the instance {{ - $labels.cluster_name }} in namespace {{ $labels.namespace - }}. Please check pod counts in affected namespace. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical - summary: Replication status check - expr: pg_replication_slots_active == 0 - for: 10m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-lag - rules: - - alert: PostgreSQLReplicationLagCritical - annotations: - description: Replication lag size on namespace {{$labels.exported_namespace}} - instance ({{$labels.application_name}}) is currently {{ - $value | humanize1024}}B behind the leader. - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical - summary: Replication lag status check - expr: pg_replication_status_lag_size > 1e+09 - for: 5m - labels: - severity: critical - syn_team: schedar - - name: postgresql-replication-count - rules: - - alert: PostgreSQLPodReplicasCritical - annotations: - description: Replication is broken in namespace {{$labels.namespace}}, - check statefulset ({{$labels.statefulset}}). - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical - summary: Replication lag status check - expr: kube_statefulset_status_replicas_available{statefulset=~".+", - namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} - for: 5m - labels: - severity: critical - syn_team: schedar + spec: + forProvider: + manifest: + metadata: + name: PostgreSQL-rules + spec: + groups: + - name: PostgreSQL-general-alerts + rules: + - name: PostgreSQL-storage + rules: + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance + {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: PostgreSQLPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume + claimed by the instance {{ $labels.name }} in + namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: PostgreSQLMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: postgresql-memory + - name: postgresql-connections + rules: + - alert: PostgreSQLConnectionsCritical + annotations: + description: |- + The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. + Please reduce the load of this instance. 
+ summary: Connection usage critical + expr: label_replace( topk(1, sum(pg_stat_activity_count) + by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) + by (pod, namespace)) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-postgresql-(.+)-.+") + for: 120m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication + rules: + - alert: PostgreSQLReplicationCritical + annotations: + description: The number of replicas for the instance + {{ $labels.cluster_name }} in namespace {{ $labels.namespace + }}. Please check pod counts in affected namespace. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical + summary: Replication status check + expr: pg_replication_slots_active == 0 + for: 10m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-lag + rules: + - alert: PostgreSQLReplicationLagCritical + annotations: + description: Replication lag size on namespace {{$labels.exported_namespace}} + instance ({{$labels.application_name}}) is currently + {{ $value | humanize1024}}B behind the leader. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical + summary: Replication lag status check + expr: pg_replication_status_lag_size > 1e+09 + for: 5m + labels: + severity: critical + syn_team: schedar + - name: postgresql-replication-count + rules: + - alert: PostgreSQLPodReplicasCritical + annotations: + description: Replication is broken in namespace + {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}). + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical + summary: Replication lag status check + expr: kube_statefulset_status_replicas_available{statefulset=~".+", + namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"} + for: 5m + labels: + severity: critical + syn_team: schedar providerConfigRef: name: kubernetes name: prometheusrule diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index b4c739ee6..bd27d97e7 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -83,6 +83,106 @@ spec: - fromFieldPath: status.atProvider.manifest.metadata.labels[appuio.io/organization] toFieldPath: metadata.labels[appuio.io/organization] type: ToCompositeFieldPath + - base: + apiVersion: kubernetes.crossplane.io/v1alpha1 + kind: Object + metadata: + name: prometheusrule + spec: + forProvider: + manifest: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + spec: + forProvider: + manifest: + metadata: + name: redis-rules + spec: + groups: + - name: redis-general-alerts + rules: + - name: redis-storage + rules: + - alert: redisPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance + {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-redis-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: redisPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume + claimed by the instance {{ $labels.name }} in + namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, + persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", + "vshn-redis-(.+)-.+") + for: 1h + labels: + severity: warning + - alert: redisMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours. + Please reducde the load of this instance, or increase the memory. + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="redis"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 90) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 120m + labels: + severity: warning + syn_team: schedar + name: redis-memory + providerConfigRef: + name: kubernetes + name: prometheusrule + patches: + - fromFieldPath: metadata.labels[crossplane.io/composite] + toFieldPath: metadata.name + transforms: + - string: + fmt: '%s-prometheusrule' + type: Format + type: string + type: FromCompositeFieldPath + - fromFieldPath: metadata.labels[crossplane.io/composite] + toFieldPath: spec.forProvider.manifest.metadata.namespace + transforms: + - string: + fmt: vshn-redis-%s + type: Format + type: string + type: FromCompositeFieldPath - base: apiVersion: kubernetes.crossplane.io/v1alpha1 kind: Object @@ -706,6 +806,8 @@ spec: extraEnvVars: - name: REDIS_EXPORTER_SKIP_TLS_VERIFICATION value: 'true' + - name: REDIS_EXPORTER_INCL_SYSTEM_METRICS + value: 'true' serviceMonitor: enabled: true namespace: ''