Skip to content

Commit

Permalink
Refactor prometheus function into separate file, add missing runbook
Browse files Browse the repository at this point in the history
Signed-off-by: Nicolas Bigler <nicolas.bigler@vshn.ch>
  • Loading branch information
TheBigLee committed Nov 17, 2023
1 parent 5adf8b8 commit 1930004
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 12 deletions.
107 changes: 107 additions & 0 deletions component/component/prometheus.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';


// bottomPod wraps the PromQL expression `query` so that:
//   * only the single lowest-valued series is kept (bottomk(1, ...)),
//   * the label_appcat_vshn_io_claim_namespace label is pulled in from
//     kube_namespace_labels by matching on `namespace`,
//   * a `name` label is derived from the namespace via label_replace, using
//     the regex capture group in "vshn-replacemeplease-(.+)-.+".
//
// The returned string still contains the literal placeholder prefix
// 'vshn-replacemeplease'; callers are expected to substitute the real
// service prefix afterwards (e.g. with std.strReplace).
//
// query: PromQL expression as a string.
// Returns: the wrapped PromQL expression as a string.
local bottomPod(query) =
  local template = 'label_replace( bottomk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")';
  // The template uses a *named* placeholder, so the value is supplied through
  // an object. Passing the bare string only worked because std.format treats
  // non-array/non-object values as a singleton array and ignores the key.
  template % { query: query };
// topPod wraps the PromQL expression `query` so that:
//   * only the single highest-valued series is kept (topk(1, ...)),
//   * the label_appcat_vshn_io_claim_namespace label is pulled in from
//     kube_namespace_labels by matching on `namespace`,
//   * a `name` label is derived from the namespace via label_replace, using
//     the regex capture group in "vshn-replacemeplease-(.+)-.+".
//
// The returned string still contains the literal placeholder prefix
// 'vshn-replacemeplease'; callers are expected to substitute the real
// service prefix afterwards (e.g. with std.strReplace).
//
// query: PromQL expression as a string.
// Returns: the wrapped PromQL expression as a string.
local topPod(query) =
  local template = 'label_replace( topk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")';
  // The template uses a *named* placeholder, so the value is supplied through
  // an object. Passing the bare string only worked because std.format treats
  // non-array/non-object values as a singleton array and ignores the key.
  template % { query: query };

// generatePrometheusNonSLORules builds the non-SLO alerting rules (storage
// and memory) for one service.
//
// Parameters:
//   serviceName               - service name; used verbatim as the alert name
//                               prefix and lower-cased for rule/group names
//                               and for the 'vshn-<service>' namespace prefix.
//   memoryContainerName       - value of the `container` label selected in
//                               the memory usage query.
//   additionalAlertsRuleGroup - array of extra rule groups appended after the
//                               built-in storage and memory groups.
//
// Returns an object with two fields:
//   name - the constant 'prometheusrule'
//   base - a 'kubernetes.crossplane.io/v1alpha1' Object whose
//          forProvider.manifest is a 'monitoring.coreos.com/v1'
//          PrometheusRule containing the generated groups.
local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup) = {
  // standardized lowercase regardless of what came as input
  local serviceNameLower = std.asciiLower(serviceName),

  // bottomPod/topPod emit this placeholder inside their namespace regex; it is
  // swapped for the service-specific prefix once the expression is assembled.
  local toReplace = 'vshn-replacemeplease',
  local namespacePrefix = 'vshn-' + serviceNameLower,
  local forService(expr) = std.strReplace(expr, toReplace, namespacePrefix),

  // PromQL fragments shared by the storage alerts.
  local queries = {
    availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}',
    availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage,
    usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}',
    unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1',
  },

  // Memory usage as a percentage of the container memory limit. The container
  // name is formatted into the inner query *before* it is wrapped by topPod,
  // so the format call never has to see the label_replace wrapper.
  local memoryUsageQuery = '(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85' % memoryContainerName,

  name: 'prometheusrule',
  base: {
    apiVersion: 'kubernetes.crossplane.io/v1alpha1',
    kind: 'Object',
    metadata: {
      name: 'prometheusrule',
    },
    spec: {
      providerConfigRef: {
        name: 'kubernetes',
      },
      forProvider+: {
        manifest+: {
          apiVersion: 'monitoring.coreos.com/v1',
          kind: 'PrometheusRule',
          metadata: {
            name: '%s-rules' % serviceNameLower,
          },
          spec: {
            groups: [
              {
                name: '%s-storage' % serviceNameLower,
                rules: [
                  // Critical: less than 3% of the volume is free.
                  {
                    alert: serviceName + 'PersistentVolumeFillingUp',
                    annotations: {
                      description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.',
                      runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
                      summary: 'PersistentVolume is filling up.',
                    },
                    expr: forService(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries)),
                    'for': '1m',
                    labels: {
                      severity: 'critical',
                      syn_team: 'schedar',
                    },
                  },
                  // Warning: less than 15% free and, extrapolating the last
                  // 6h linearly, expected to run out within four days.
                  {
                    alert: serviceName + 'PersistentVolumeFillingUp',
                    annotations: {
                      description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
                      runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
                      summary: 'PersistentVolume is filling up.',
                    },
                    expr: forService(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries)),
                    'for': '1h',
                    labels: {
                      severity: 'warning',
                    },
                  },
                ],
              },
              {
                name: serviceNameLower + '-memory',
                rules: [
                  // Critical: working set above 85% of the memory limit for
                  // two hours.
                  {
                    alert: serviceName + 'MemoryCritical',
                    annotations: {
                      description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reduce the load of this instance, or increase the memory.',
                      // The fragment must match the runbook page's anchor id
                      // exactly; URL fragments are case-sensitive.
                      runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-generic.html#memoryCritical',
                      summary: 'Memory usage critical',
                    },
                    expr: forService(topPod(memoryUsageQuery)),
                    'for': '120m',
                    labels: {
                      severity: 'critical',
                      syn_team: 'schedar',
                    },
                  },
                ],
              },
            ] + additionalAlertsRuleGroup,
          },
        },
      },
    },
  },
};

// Public interface of this library: the rule generator plus the two PromQL
// wrapper helpers, each exposed as a function-valued field.
//
// NOTE(review): other component files changed alongside this library call
// `prom.PromRuleSLA(...)`; no such field is exported here - confirm it is
// provided before those callers are evaluated.
{
  // (serviceName, memoryContainerName, additionalAlertsRuleGroup) -> object
  // with `name` and `base` fields.
  GeneratePrometheusNonSLORules: generatePrometheusNonSLORules,

  // (query) -> PromQL string keeping only the highest-valued series.
  topPod: topPod,

  // (query) -> PromQL string keeping only the lowest-valued series.
  bottomPod: bottomPod,
}
6 changes: 3 additions & 3 deletions component/component/vshn_minio.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet';
local crossplane = import 'lib/crossplane.libsonnet';

local common = import 'common.libsonnet';
local xrds = import 'xrds.libsonnet';

local prom = import 'prometheus.libsonnet';
local slos = import 'slos.libsonnet';
local xrds = import 'xrds.libsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;
Expand All @@ -23,7 +23,7 @@ local connectionSecretKeys = [
'AWS_ACCESS_KEY_ID',
];

local promRuleMinioSLA = common.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio');
local promRuleMinioSLA = prom.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio');

local minioPlans = common.FilterDisabledParams(minioParams.plans);

Expand Down
9 changes: 4 additions & 5 deletions component/component/vshn_postgres.jsonnet
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
local common = import 'common.libsonnet';
local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
Expand All @@ -7,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet';
local crossplane = import 'lib/crossplane.libsonnet';

local common = import 'common.libsonnet';
local xrds = import 'xrds.libsonnet';

local prom = import 'prometheus.libsonnet';
local slos = import 'slos.libsonnet';
local xrds = import 'xrds.libsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;
Expand Down Expand Up @@ -50,7 +49,7 @@ local xrd = xrds.XRDFromCRD(
connectionSecretKeys=connectionSecretKeys,
) + xrds.WithPlanDefaults(pgPlans, pgParams.defaultPlan);

local promRulePostgresSLA = common.PromRuleSLA(params.services.vshn.postgres.sla, 'VSHNPostgreSQL');
local promRulePostgresSLA = prom.PromRuleSLA(params.services.vshn.postgres.sla, 'VSHNPostgreSQL');

local restoreServiceAccount = kube.ServiceAccount('copyserviceaccount') + {
metadata+: {
Expand Down Expand Up @@ -763,7 +762,7 @@ local clusterRestoreConfig = {
};


local prometheusRule = common.GeneratePrometheusNonSLORules(
local prometheusRule = prom.GeneratePrometheusNonSLORules(
'PostgreSQL',
'patroni',
[
Expand Down
8 changes: 4 additions & 4 deletions component/component/vshn_redis.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet';
local crossplane = import 'lib/crossplane.libsonnet';

local common = import 'common.libsonnet';
local xrds = import 'xrds.libsonnet';

local prom = import 'prometheus.libsonnet';
local slos = import 'slos.libsonnet';
local xrds = import 'xrds.libsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;
Expand Down Expand Up @@ -47,7 +47,7 @@ local xrd = xrds.XRDFromCRD(
connectionSecretKeys=connectionSecretKeys,
) + xrds.WithPlanDefaults(redisPlans, redisParams.defaultPlan);

local promRuleRedisSLA = common.PromRuleSLA(params.services.vshn.redis.sla, 'VSHNRedis');
local promRuleRedisSLA = prom.PromRuleSLA(params.services.vshn.redis.sla, 'VSHNRedis');

local restoreServiceAccount = kube.ServiceAccount('redisrestoreserviceaccount') + {
metadata+: {
Expand Down Expand Up @@ -386,7 +386,7 @@ local composition =
},
};

local prometheusRule = common.GeneratePrometheusNonSLORules('redis', 'redis', []) + {
local prometheusRule = prom.GeneratePrometheusNonSLORules('redis', 'redis', []) + {
patches: [
comp.FromCompositeFieldPathWithTransformSuffix('metadata.labels[crossplane.io/composite]', 'metadata.name', 'prometheusrule'),
comp.FromCompositeFieldPathWithTransformPrefix('metadata.labels[crossplane.io/composite]', 'spec.forProvider.manifest.metadata.namespace', 'vshn-redis'),
Expand Down
9 changes: 9 additions & 0 deletions docs/modules/ROOT/pages/runbooks/vshn-generic.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
= Generic alerts

[[memoryCritical]]
== MemoryCritical

The instance uses more than 85% of the memory limit set on the pod.
A further increase in memory usage might lead to the pod being OOM-killed by Kubernetes.

Either raise the memory limit of the affected instance or reduce the workload on the instance to lower memory consumption (how to do this depends heavily on the service in use).

0 comments on commit 1930004

Please sign in to comment.