diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index 718d8a886..ca71d9e35 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -134,31 +134,6 @@ local getAppCatImageString() = params.images.appcat.registry + '/' + params.imag local getApiserverImageString() = params.images.apiserver.registry + '/' + params.images.apiserver.repository + ':' + getApiserverImageTag(); -local promRuleSLA(value, service) = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', 'vshn-' + std.asciiLower(service) + '-sla') { - metadata+: { - labels: { - name: 'vshn-' + std.asciiLower(service) + '-sla', - }, - namespace: params.slos.namespace, - }, - spec: { - groups: [ - { - name: 'appcat-' + std.asciiLower(service) + '-sla-target', - rules: [ - { - expr: 'vector(' + value + ')', - labels: { - service: service, - }, - record: 'sla:objective:ratio', - }, - ], - }, - ], - }, -}; - local removeField(obj, name) = { // We don't want the name field in the actual providerConfig [k]: obj[k] @@ -173,6 +148,7 @@ local argoCDAnnotations() = { 'argocd.argoproj.io/sync-options': 'Prune=false', }; + { SyncOptions: syncOptions, VshnMetaDBaaSExoscale(dbname): @@ -197,8 +173,6 @@ local argoCDAnnotations() = { getApiserverImageTag(), GetApiserverImageString(): getApiserverImageString(), - PromRuleSLA(value, service): - promRuleSLA(value, service), RemoveField(obj, name): removeField(obj, name), ArgoCDAnnotations(): diff --git a/component/component/prometheus.libsonnet b/component/component/prometheus.libsonnet new file mode 100644 index 000000000..13f08899b --- /dev/null +++ b/component/component/prometheus.libsonnet @@ -0,0 +1,136 @@ +local kap = import 'lib/kapitan.libjsonnet'; +local kube = import 'lib/kube.libjsonnet'; + +local inv = kap.inventory(); +local params = inv.parameters.appcat; + + +local promRuleSLA(value, service) = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', 'vshn-' + 
std.asciiLower(service) + '-sla') { + metadata+: { + labels: { + name: 'vshn-' + std.asciiLower(service) + '-sla', + }, + namespace: params.slos.namespace, + }, + spec: { + groups: [ + { + name: 'appcat-' + std.asciiLower(service) + '-sla-target', + rules: [ + { + expr: 'vector(' + value + ')', + labels: { + service: service, + }, + record: 'sla:objective:ratio', + }, + ], + }, + ], + }, +}; + +local bottomPod(query) = 'label_replace( bottomk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; +local topPod(query) = 'label_replace( topk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query; + +local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup) = { + // standardized lowercase regardless of what came as input + local serviceNameLower = std.asciiLower(serviceName), + local toReplace = 'vshn-replacemeplease', + local queries = { + availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', + availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, + usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', + unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', + }, + name: 'prometheusrule', + base: { + + apiVersion: 'kubernetes.crossplane.io/v1alpha1', + kind: 'Object', + metadata: { + name: 'prometheusrule', + }, + spec: { + providerConfigRef: { + name: 'kubernetes', + }, + forProvider+: { + manifest+: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 
'PrometheusRule', + metadata: { + name: '%s-rules' % serviceNameLower, + }, + spec: { + groups: [ + { + name: '%s-storage' % serviceNameLower, + rules: [ + { + + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower), + 'for': '1m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + { + alert: serviceName + 'PersistentVolumeFillingUp', + annotations: { + description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. 
Currently {{ $value | humanizePercentage }} is available.', + runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', + summary: 'PersistentVolume is filling up.', + }, + expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower), + 'for': '1h', + labels: { + severity: 'warning', + }, + }, + ], + }, + { + name: std.asciiLower(serviceName) + '-memory', + rules: [ + { + alert: serviceName + 'MemoryCritical', + annotations: { + description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical', + summary: 'Memory usage critical', + }, + expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85') % memoryContainerName, toReplace, 'vshn-' + serviceNameLower), + 'for': '120m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + ], + }, + ] + additionalAlertsRuleGroup, + }, + }, + }, + }, + }, +}; + +{ + GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup): + generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup), + PromRuleSLA(value, service): + promRuleSLA(value, service), + TopPod(query): + topPod(query), + BottomPod(query): + bottomPod(query), +} diff --git a/component/component/vshn_minio.jsonnet b/component/component/vshn_minio.jsonnet index 6649c40be..b3b94f3ef 100644 --- a/component/component/vshn_minio.jsonnet +++ b/component/component/vshn_minio.jsonnet @@ -6,9 +6,9 @@ local comp = import 
'lib/appcat-compositions.libsonnet'; local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; -local xrds = import 'xrds.libsonnet'; - +local prom = import 'prometheus.libsonnet'; local slos = import 'slos.libsonnet'; +local xrds = import 'xrds.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.appcat; @@ -23,7 +23,7 @@ local connectionSecretKeys = [ 'AWS_ACCESS_KEY_ID', ]; -local promRuleMinioSLA = common.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio'); +local promRuleMinioSLA = prom.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio'); local minioPlans = common.FilterDisabledParams(minioParams.plans); diff --git a/component/component/vshn_postgres.jsonnet b/component/component/vshn_postgres.jsonnet index a1fc80a9e..53f4c35f2 100644 --- a/component/component/vshn_postgres.jsonnet +++ b/component/component/vshn_postgres.jsonnet @@ -1,4 +1,3 @@ -local common = import 'common.libsonnet'; local com = import 'lib/commodore.libjsonnet'; local kap = import 'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; @@ -7,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet'; local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; -local xrds = import 'xrds.libsonnet'; - +local prom = import 'prometheus.libsonnet'; local slos = import 'slos.libsonnet'; +local xrds = import 'xrds.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.appcat; @@ -50,7 +49,7 @@ local xrd = xrds.XRDFromCRD( connectionSecretKeys=connectionSecretKeys, ) + xrds.WithPlanDefaults(pgPlans, pgParams.defaultPlan); -local promRulePostgresSLA = common.PromRuleSLA(params.services.vshn.postgres.sla, 'VSHNPostgreSQL'); +local promRulePostgresSLA = prom.PromRuleSLA(params.services.vshn.postgres.sla, 'VSHNPostgreSQL'); local restoreServiceAccount = kube.ServiceAccount('copyserviceaccount') + { metadata+: { @@ -762,159 +761,90 @@ local clusterRestoreConfig = { 
], }; -local prometheusRule = { - name: 'prometheusrule', - base: comp.KubeObject('monitoring.coreos.com/v1', 'PrometheusRule') + { - spec+: { - forProvider+: { - manifest+: { - metadata: { - name: 'postgresql-rules', + +local prometheusRule = prom.GeneratePrometheusNonSLORules( + 'PostgreSQL', + 'patroni', + [ + { + name: 'postgresql-connections', + rules: [ + { + alert: 'PostgreSQLConnectionsCritical', + annotations: { + description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical', + summary: 'Connection usage critical', }, - local bottomPod(query) = 'label_replace( bottomk(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+")' % query, - local topPod(query) = 'label_replace( topk(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+")' % query, - spec: { - groups: [ - { - name: 'postgresql-storage', - local queries = { - availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}', - availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage, - usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}', - unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1', - }, - rules: [ - { - alert: 'PostgreSQLPersistentVolumeFillingUp', - annotations: { - description: 'The 
volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), - 'for': '1m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - { - alert: 'PostgreSQLPersistentVolumeFillingUp', - annotations: { - description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.', - runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup', - summary: 'PersistentVolume is filling up.', - }, - expr: bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), - 'for': '1h', - labels: { - severity: 'warning', - }, - }, - ], - }, - { - name: 'postgresql-memory', - rules: [ - { - alert: 'PostgreSQLMemoryCritical', - annotations: { - description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', - // runbook_url: 'TBD', - summary: 'Memory usage critical', - }, - expr: topPod('(container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85'), - 'for': '120m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-connections', - rules: [ - { - alert: 
'PostgreSQLConnectionsCritical', - annotations: { - description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.', - // runbook_url: 'TBD', - summary: 'Connection usage critical', - }, - expr: topPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), - 'for': '120m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - // new - { - name: 'postgresql-replication', - rules: [ - { - alert: 'PostgreSQLReplicationCritical', - annotations: { - description: 'The number of replicas for the instance {{ $labels.cluster_name }} in namespace {{ $labels.namespace }}. Please check pod counts in affected namespace.', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical', - summary: 'Replication status check', - }, - expr: 'pg_replication_slots_active == 0', - 'for': '10m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-replication-lag', - rules: [ - { - alert: 'PostgreSQLReplicationLagCritical', - annotations: { - description: 'Replication lag size on namespace {{$labels.exported_namespace}} instance ({{$labels.application_name}}) is currently {{ $value | humanize1024}}B behind the leader.', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical', - summary: 'Replication lag status check', - }, - expr: 'pg_replication_status_lag_size > 1e+09', - 'for': '5m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - { - name: 'postgresql-replication-count', - rules: [ - { - alert: 'PostgreSQLPodReplicasCritical', - annotations: { - description: 'Replication is broken in namespace {{$labels.namespace}}, check statefulset 
({{$labels.statefulset}}).', - runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical', - summary: 'Replication lag status check', - }, - expr: 'kube_statefulset_status_replicas_available{statefulset=~".+", namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"}', - 'for': '5m', - labels: { - severity: 'critical', - syn_team: 'schedar', - }, - }, - ], - }, - ], + + expr: std.strReplace(prom.TopPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), 'vshn-replacemeplease', 'vshn-' + std.asciiLower('PostgreSQL')), + 'for': '120m', + labels: { + severity: 'critical', + syn_team: 'schedar', }, }, - }, + ], }, - }, + { + name: 'postgresql-replication', + rules: [ + { + alert: 'PostgreSQLReplicationCritical', + annotations: { + description: 'The number of replicas for the instance {{ $labels.cluster_name }} in namespace {{ $labels.namespace }}. 
Please check pod counts in affected namespace.', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical', + summary: 'Replication status check', + }, + expr: 'pg_replication_slots_active == 0', + 'for': '10m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + ], + }, + { + name: 'postgresql-replication-lag', + rules: [ + { + alert: 'PostgreSQLReplicationLagCritical', + annotations: { + description: 'Replication lag size on namespace {{$labels.exported_namespace}} instance ({{$labels.application_name}}) is currently {{ $value | humanize1024}}B behind the leader.', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical', + summary: 'Replication lag status check', + }, + expr: 'pg_replication_status_lag_size > 1e+09', + 'for': '5m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + ], + }, + { + name: 'postgresql-replication-count', + rules: [ + { + alert: 'PostgreSQLPodReplicasCritical', + annotations: { + description: 'Replication is broken in namespace {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}).', + runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical', + summary: 'Replication lag status check', + }, + expr: 'kube_statefulset_status_replicas_available{statefulset=~".+", namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"}', + 'for': '5m', + labels: { + severity: 'critical', + syn_team: 'schedar', + }, + }, + ], + }, + ] +) + { patches: [ comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'), comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-postgresql'), diff --git a/component/component/vshn_redis.jsonnet 
b/component/component/vshn_redis.jsonnet index 6c2851c1b..1d2a30cc3 100644 --- a/component/component/vshn_redis.jsonnet +++ b/component/component/vshn_redis.jsonnet @@ -6,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet'; local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; -local xrds = import 'xrds.libsonnet'; - +local prom = import 'prometheus.libsonnet'; local slos = import 'slos.libsonnet'; +local xrds = import 'xrds.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.appcat; @@ -47,7 +47,7 @@ local xrd = xrds.XRDFromCRD( connectionSecretKeys=connectionSecretKeys, ) + xrds.WithPlanDefaults(redisPlans, redisParams.defaultPlan); -local promRuleRedisSLA = common.PromRuleSLA(params.services.vshn.redis.sla, 'VSHNRedis'); +local promRuleRedisSLA = prom.PromRuleSLA(params.services.vshn.redis.sla, 'VSHNRedis'); local restoreServiceAccount = kube.ServiceAccount('redisrestoreserviceaccount') + { metadata+: { @@ -386,6 +386,14 @@ local composition = }, }; + local prometheusRule = prom.GeneratePrometheusNonSLORules('redis', 'redis', []) + { + patches: [ + comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'), + comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-redis'), + ], + }; + + local redisHelmChart = { apiVersion: 'helm.crossplane.io/v1beta1', @@ -407,7 +415,10 @@ local composition = { name: 'REDIS_EXPORTER_SKIP_TLS_VERIFICATION', value: 'true', - + }, + { + name: 'REDIS_EXPORTER_INCL_SYSTEM_METRICS', + value: 'true', }, ], containerSecurityContext: { @@ -537,6 +548,7 @@ local composition = comp.ToCompositeFieldPath('status.atProvider.manifest.metadata.labels[appuio.io/organization]', 'metadata.labels[appuio.io/organization]'), ], }, + prometheusRule, { name: 'namespace-conditions', base: namespace, diff --git 
a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index d5a680374..12878be2f 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -949,7 +949,8 @@ spec: - base: apiVersion: kubernetes.crossplane.io/v1alpha1 kind: Object - metadata: {} + metadata: + name: prometheusrule spec: forProvider: manifest: @@ -1008,6 +1009,7 @@ spec: description: |- The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. Please reducde the load of this instance, or increase the memory. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical summary: Memory usage critical expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} @@ -1024,6 +1026,7 @@ spec: description: |- The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. Please reduce the load of this instance. 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical summary: Connection usage critical expr: label_replace( topk(1, sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index ee6e7fda2..eb74bafcf 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1051,7 +1051,8 @@ spec: - base: apiVersion: kubernetes.crossplane.io/v1alpha1 kind: Object - metadata: {} + metadata: + name: prometheusrule spec: forProvider: manifest: @@ -1110,6 +1111,7 @@ spec: description: |- The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. Please reducde the load of this instance, or increase the memory. + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical summary: Memory usage critical expr: label_replace( topk(1, (container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} @@ -1126,6 +1128,7 @@ spec: description: |- The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. Please reduce the load of this instance. 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical summary: Connection usage critical expr: label_replace( topk(1, sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index 7008a1f42..2863eade0 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -83,6 +83,99 @@ spec: - fromFieldPath: status.atProvider.manifest.metadata.labels[appuio.io/organization] toFieldPath: metadata.labels[appuio.io/organization] type: ToCompositeFieldPath + - base: + apiVersion: kubernetes.crossplane.io/v1alpha1 + kind: Object + metadata: + name: prometheusrule + spec: + forProvider: + manifest: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + metadata: + name: redis-rules + spec: + groups: + - name: redis-storage + rules: + - alert: redisPersistentVolumeFillingUp + annotations: + description: The volume claimed by the instance {{ $labels.name + }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} + == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 1m + labels: + severity: critical + syn_team: schedar + - alert: redisPersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the volume claimed + by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace + }} is expected to fill up within four days. Currently + {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", + metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet", + metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", + metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless + on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ + access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} + == 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 1h + labels: + severity: warning + - name: redis-memory + rules: + - alert: redisMemoryCritical + annotations: + description: |- + The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours. + Please reducde the load of this instance, or increase the memory. 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical + summary: Memory usage critical + expr: label_replace( topk(1, (container_memory_working_set_bytes{container="redis"} / + on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} + * 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) + kube_namespace_labels, "name", "$1", "namespace", "vshn-redis-(.+)-.+") + for: 120m + labels: + severity: critical + syn_team: schedar + providerConfigRef: + name: kubernetes + name: prometheusrule + patches: + - fromFieldPath: metadata.labels[crossplane.io/composite] + toFieldPath: metadata.name + transforms: + - string: + fmt: '%s-prometheusrule' + type: Format + type: string + type: FromCompositeFieldPath + - fromFieldPath: metadata.labels[crossplane.io/composite] + toFieldPath: spec.forProvider.manifest.metadata.namespace + transforms: + - string: + fmt: vshn-redis-%s + type: Format + type: string + type: FromCompositeFieldPath - base: apiVersion: kubernetes.crossplane.io/v1alpha1 kind: Object @@ -706,6 +799,8 @@ spec: extraEnvVars: - name: REDIS_EXPORTER_SKIP_TLS_VERIFICATION value: 'true' + - name: REDIS_EXPORTER_INCL_SYSTEM_METRICS + value: 'true' serviceMonitor: enabled: true namespace: '' diff --git a/docs/modules/ROOT/pages/runbooks/vshn-generic.adoc b/docs/modules/ROOT/pages/runbooks/vshn-generic.adoc new file mode 100644 index 000000000..8c4429718 --- /dev/null +++ b/docs/modules/ROOT/pages/runbooks/vshn-generic.adoc @@ -0,0 +1,9 @@ += Generic alerts + +[[MemoryCritical]] +== MemoryCritical + +The instance uses more than 85% of the memory limit set on the pod. +A further increase in memory usage might lead to the pod being OOM-killed by Kubernetes. + +Either adjust the limits of the affected instance or reduce the workload on the instance to lower memory consumption (this depends highly on the used service).
diff --git a/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc b/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc index b9d9ed15a..147cc6812 100644 --- a/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc +++ b/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc @@ -167,8 +167,15 @@ This alert fires when there are issues with statefullset responsible for replica ``` kubectl describe -n vshn-postgresql- sts -## for exmaple: kubectl -n vshn-postgresql-test-cluster-always-true-jnlj4 describe sts test-cluster-always-true-jnlj4 +## for example: kubectl -n vshn-postgresql-test-cluster-always-true-jnlj4 describe sts test-cluster-always-true-jnlj4 ## get events from affected namespace and look for issues k -n vshn-postgresql-test-cluster-always-true-jnlj4 get events ``` + +[[PostgreSQLConnectionsCritical]] +== PostgreSQLConnectionsCritical + +This alert fires when the number of used connections is over 90% of the configured `max_connections` limit (defaults to 100). +It means that either the connection limit is set too low or an application is misbehaving and spawning too many connections. +You either need to raise the `max_connections` parameter on the PostgreSQL instance or debug the application, as it might be misbehaving and spawning too many connections.