Skip to content

Commit

Permalink
adding memory and storage alerts for Redis alert
Browse files Browse the repository at this point in the history
  • Loading branch information
lukasz.widera@vshn.ch committed Nov 14, 2023
1 parent 0090986 commit 70b68d2
Show file tree
Hide file tree
Showing 6 changed files with 566 additions and 392 deletions.
108 changes: 108 additions & 0 deletions component/component/common.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,108 @@ local argoCDAnnotations() = {
'argocd.argoproj.io/sync-options': 'Prune=false',
};

// Shared PromQL template behind topPod/bottomPod.
//
// Wraps `query` in `<selector>(1, ...)`, joins the result with
// kube_namespace_labels on `namespace` (pulling in the
// label_appcat_vshn_io_claim_namespace label) and uses label_replace to write
// the capture group of the namespace regex into a `name` label.
//
// The regex contains the literal placeholder 'vshn-replacemeplease'; callers
// substitute it with the service-specific namespace prefix via std.strReplace.
//
// Positional %s placeholders are used with an array operand so the template
// does not rely on std.format ignoring a mapping key for non-object values.
local rankedPod(selector, query) =
  'label_replace( %s(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % [selector, query];

// Selects the single lowest-valued series of `query` (bottomk).
local bottomPod(query) = rankedPod('bottomk', query);
// Selects the single highest-valued series of `query` (topk).
local topPod(query) = rankedPod('topk', query);

// Builds the composition resource ({ name, base }) that wraps a PrometheusRule
// in a kubernetes.crossplane.io Object, containing the storage and memory
// alerts shared by all services.
//
// Parameters:
//   serviceName          Service name as it should appear in alert names
//                        (e.g. 'PostgreSQL'). A lowercased copy is used for
//                        resource names, group names and the namespace prefix.
//   memoryContainerName  Value of the `container` label selected in the
//                        memory alert query.
//   additionalAlerts     Array of extra rule groups ({ name, rules }) that is
//                        appended to the PrometheusRule's `spec.groups`.
//
// Returns an object with `name` and `base`; callers add their own `patches`.
local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts) = {
  // standardized lowercase regardless of what came as input
  local serviceNameLower = std.asciiLower(serviceName),
  // Placeholder baked into the topPod/bottomPod namespace regex.
  local toReplace = 'vshn-replacemeplease',
  // Swaps the placeholder for this service's namespace prefix.
  local forService(expr) = std.strReplace(expr, toReplace, 'vshn-' + serviceNameLower),
  // PromQL fragments shared by the two storage alerts.
  local queries = {
    availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}',
    availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage,
    usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}',
    unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1',
  },

  name: 'prometheusrule',
  base: {
    apiVersion: 'kubernetes.crossplane.io/v1alpha1',
    kind: 'Object',
    metadata: {
      name: 'prometheusrule',
    },
    spec: {
      providerConfigRef: {
        name: 'kubernetes',
      },
      forProvider: {
        // The PrometheusRule sits directly under spec.forProvider.manifest,
        // which is the path the callers' namespace patch targets.
        manifest: {
          apiVersion: 'monitoring.coreos.com/v1',
          kind: 'PrometheusRule',
          metadata: {
            // Lowercased: Kubernetes object names must not contain uppercase.
            name: '%s-rules' % serviceNameLower,
          },
          spec: {
            // Every entry of `groups` is a rule group ({ name, rules });
            // alerting rules only ever appear inside a group's `rules`.
            groups: [
              {
                name: '%s-storage' % serviceNameLower,
                rules: [
                  {
                    alert: serviceName + 'PersistentVolumeFillingUp',
                    annotations: {
                      description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.',
                      runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
                      summary: 'PersistentVolume is filling up.',
                    },
                    expr: forService(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries)),
                    'for': '1m',
                    labels: {
                      severity: 'critical',
                      syn_team: 'schedar',
                    },
                  },
                  {
                    alert: serviceName + 'PersistentVolumeFillingUp',
                    annotations: {
                      description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
                      runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
                      summary: 'PersistentVolume is filling up.',
                    },
                    expr: forService(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries)),
                    'for': '1h',
                    labels: {
                      severity: 'warning',
                    },
                  },
                ],
              },
              {
                name: '%s-memory' % serviceNameLower,
                rules: [
                  {
                    alert: serviceName + 'MemoryCritical',
                    annotations: {
                      description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 90% for 2 hours.\n Please reducde the load of this instance, or increase the memory.',
                      // runbook_url: 'TBD',
                      summary: 'Memory usage critical',
                    },
                    // The container name is formatted into the query before
                    // topPod wraps it, so the wrapped template is never
                    // re-parsed as a format string.
                    expr: forService(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 90' % memoryContainerName)),
                    'for': '120m',
                    labels: {
                      severity: 'warning',
                      syn_team: 'schedar',
                    },
                  },
                ],
              },
            ] + additionalAlerts,
          },
        },
      },
    },
  },
};


{
SyncOptions: syncOptions,
VshnMetaDBaaSExoscale(dbname):
Expand Down Expand Up @@ -203,4 +305,10 @@ local argoCDAnnotations() = {
removeField(obj, name),
ArgoCDAnnotations():
argoCDAnnotations(),
GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts):
generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlerts),
topPod(query):
topPod(query),
bottomPod(query):
bottomPod(query),
}
230 changes: 81 additions & 149 deletions component/component/vshn_postgres.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -762,159 +762,91 @@ local clusterRestoreConfig = {
],
};

local prometheusRule = {
name: 'prometheusrule',
base: comp.KubeObject('monitoring.coreos.com/v1', 'PrometheusRule') + {
spec+: {
forProvider+: {
manifest+: {
metadata: {
name: 'postgresql-rules',

local prometheusRule = common.GeneratePrometheusNonSLORules(
'PostgreSQL',
'patroni',
[
{
name: 'postgresql-connections',
rules: [
{
alert: 'PostgreSQLConnectionsCritical',
annotations: {
description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.',
// runbook_url: 'TBD',
summary: 'Connection usage critical',
},
local bottomPod(query) = 'label_replace( bottomk(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+")' % query,
local topPod(query) = 'label_replace( topk(1, %s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-postgresql-(.+)-.+")' % query,
spec: {
groups: [
{
name: 'postgresql-storage',
local queries = {
availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}',
availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage,
usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}',
unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1',
},
rules: [
{
alert: 'PostgreSQLPersistentVolumeFillingUp',
annotations: {
description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.',
runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
summary: 'PersistentVolume is filling up.',
},
expr: bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries),
'for': '1m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
{
alert: 'PostgreSQLPersistentVolumeFillingUp',
annotations: {
description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
summary: 'PersistentVolume is filling up.',
},
expr: bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries),
'for': '1h',
labels: {
severity: 'warning',
},
},
],
},
{
name: 'postgresql-memory',
rules: [
{
alert: 'PostgreSQLMemoryCritical',
annotations: {
description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.',
// runbook_url: 'TBD',
summary: 'Memory usage critical',
},
expr: topPod('(container_memory_working_set_bytes{container="patroni"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85'),
'for': '120m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
],
},
{
name: 'postgresql-connections',
rules: [
{
alert: 'PostgreSQLConnectionsCritical',
annotations: {
description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.',
// runbook_url: 'TBD',
summary: 'Connection usage critical',
},
expr: topPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'),
'for': '120m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
],
},
// new
{
name: 'postgresql-replication',
rules: [
{
alert: 'PostgreSQLReplicationCritical',
annotations: {
description: 'The number of replicas for the instance {{ $labels.cluster_name }} in namespace {{ $labels.namespace }}. Please check pod counts in affected namespace.',
runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical',
summary: 'Replication status check',
},
expr: 'pg_replication_slots_active == 0',
'for': '10m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
],
},
{
name: 'postgresql-replication-lag',
rules: [
{
alert: 'PostgreSQLReplicationLagCritical',
annotations: {
description: 'Replication lag size on namespace {{$labels.exported_namespace}} instance ({{$labels.application_name}}) is currently {{ $value | humanize1024}}B behind the leader.',
runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical',
summary: 'Replication lag status check',
},
expr: 'pg_replication_status_lag_size > 1e+09',
'for': '5m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
],
},
{
name: 'postgresql-replication-count',
rules: [
{
alert: 'PostgreSQLPodReplicasCritical',
annotations: {
description: 'Replication is broken in namespace {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}).',
runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical',
summary: 'Replication lag status check',
},
expr: 'kube_statefulset_status_replicas_available{statefulset=~".+", namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"}',
'for': '5m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
],
},
],

expr: std.strReplace(common.topPod('sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) by (pod, namespace)'), 'vshn-replacemeplease', 'vshn-' + std.asciiLower('PostgreSQL')),
'for': '120m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
},
],
},
},
// new
{
name: 'postgresql-replication',
rules: [
{
alert: 'PostgreSQLReplicationCritical',
annotations: {
description: 'The number of replicas for the instance {{ $labels.cluster_name }} in namespace {{ $labels.namespace }}. Please check pod counts in affected namespace.',
runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationCritical',
summary: 'Replication status check',
},
expr: 'pg_replication_slots_active == 0',
'for': '10m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
],
},
{
name: 'postgresql-replication-lag',
rules: [
{
alert: 'PostgreSQLReplicationLagCritical',
annotations: {
description: 'Replication lag size on namespace {{$labels.exported_namespace}} instance ({{$labels.application_name}}) is currently {{ $value | humanize1024}}B behind the leader.',
runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLReplicationLagCritical',
summary: 'Replication lag status check',
},
expr: 'pg_replication_status_lag_size > 1e+09',
'for': '5m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
],
},
{
name: 'postgresql-replication-count',
rules: [
{
alert: 'PostgreSQLPodReplicasCritical',
annotations: {
description: 'Replication is broken in namespace {{$labels.namespace}}, check statefulset ({{$labels.statefulset}}).',
runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLPodReplicasCritical',
summary: 'Replication lag status check',
},
expr: 'kube_statefulset_status_replicas_available{statefulset=~".+", namespace=~"vshn-postgresql-.+"} != kube_statefulset_replicas{statefulset=~".+",namespace=~"vshn-postgresql-.+"}',
'for': '5m',
labels: {
severity: 'critical',
syn_team: 'schedar',
},
},
],
},
]
) + {
patches: [
comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'),
comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-postgresql'),
Expand Down
14 changes: 13 additions & 1 deletion component/component/vshn_redis.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,14 @@ local composition =
},
};

// Composition resource for the Redis PrometheusRule: the shared non-SLO rules
// (service name 'redis', memory container 'redis', no additional alerts)
// extended with the patches that derive the object name and the target
// namespace from the composite label.
local prometheusRule =
  local compositeLabel = 'metadata.labels[crossplane.io/composite]';
  common.GeneratePrometheusNonSLORules('redis', 'redis', []) {
    patches: [
      comp.FromCompositeFieldPathWithTransformSuffix(compositeLabel, 'metadata.name', 'prometheusrule'),
      comp.FromCompositeFieldPathWithTransformPrefix(compositeLabel, 'spec.forProvider.manifest.metadata.namespace', 'vshn-redis'),
    ],
  };


local redisHelmChart =
{
apiVersion: 'helm.crossplane.io/v1beta1',
Expand All @@ -407,7 +415,10 @@ local composition =
{
name: 'REDIS_EXPORTER_SKIP_TLS_VERIFICATION',
value: 'true',

},
{
name: 'REDIS_EXPORTER_INCL_SYSTEM_METRICS',
value: 'true',
},
],
containerSecurityContext: {
Expand Down Expand Up @@ -537,6 +548,7 @@ local composition =
comp.ToCompositeFieldPath('status.atProvider.manifest.metadata.labels[appuio.io/organization]', 'metadata.labels[appuio.io/organization]'),
],
},
prometheusRule,
{
name: 'namespace-conditions',
base: namespace,
Expand Down
Loading

0 comments on commit 70b68d2

Please sign in to comment.