Skip to content

Commit

Permalink
Merge pull request #256 from vshn/Add/non_slo_redis_alerts
Browse files Browse the repository at this point in the history
Add/non slo redis alerts
  • Loading branch information
TheBigLee authored Nov 20, 2023
2 parents 48e4f0b + 11fcebe commit 79ea63f
Show file tree
Hide file tree
Showing 10 changed files with 359 additions and 190 deletions.
28 changes: 1 addition & 27 deletions component/component/common.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -134,31 +134,6 @@ local getAppCatImageString() = params.images.appcat.registry + '/' + params.imag

local getApiserverImageString() = params.images.apiserver.registry + '/' + params.images.apiserver.repository + ':' + getApiserverImageTag();

// Build a PrometheusRule recording the SLA objective for a service.
// Emits a constant vector (`value`) under the record `sla:objective:ratio`,
// labelled with the service name, in the SLO namespace — so dashboards can
// compare measured availability against this target.
local promRuleSLA(value, service) = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', 'vshn-' + std.asciiLower(service) + '-sla') {
metadata+: {
labels: {
name: 'vshn-' + std.asciiLower(service) + '-sla',
},
// Deployed into the configured SLO namespace, not the service namespace.
namespace: params.slos.namespace,
},
spec: {
groups: [
{
name: 'appcat-' + std.asciiLower(service) + '-sla-target',
rules: [
{
// Constant expression: always evaluates to the SLA target value.
expr: 'vector(' + value + ')',
labels: {
service: service,
},
record: 'sla:objective:ratio',
},
],
},
],
},
};

local removeField(obj, name) = {
// We don't want the name field in the actual providerConfig
[k]: obj[k]
Expand All @@ -173,6 +148,7 @@ local argoCDAnnotations() = {
'argocd.argoproj.io/sync-options': 'Prune=false',
};


{
SyncOptions: syncOptions,
VshnMetaDBaaSExoscale(dbname):
Expand All @@ -197,8 +173,6 @@ local argoCDAnnotations() = {
getApiserverImageTag(),
GetApiserverImageString():
getApiserverImageString(),
PromRuleSLA(value, service):
promRuleSLA(value, service),
RemoveField(obj, name):
removeField(obj, name),
ArgoCDAnnotations():
Expand Down
136 changes: 136 additions & 0 deletions component/component/prometheus.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;


// Record the SLA target for `service` as a constant Prometheus vector.
//
// Produces a PrometheusRule named `vshn-<service>-sla` in the SLO
// namespace, recording `value` (e.g. 99.9) as `sla:objective:ratio` with
// a `service` label, so SLO dashboards can compare measured availability
// against the objective.
local promRuleSLA(value, service) =
  local serviceLower = std.asciiLower(service);
  local ruleName = 'vshn-%s-sla' % serviceLower;
  kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', ruleName) {
    metadata+: {
      labels: { name: ruleName },
      // Lives in the central SLO namespace, not the service namespace.
      namespace: params.slos.namespace,
    },
    spec: {
      groups: [
        {
          name: 'appcat-%s-sla-target' % serviceLower,
          rules: [
            {
              record: 'sla:objective:ratio',
              // Constant expression carrying the SLA target value.
              expr: 'vector(' + value + ')',
              labels: { service: service },
            },
          ],
        },
      ],
    },
  };

// Wrap a PromQL expression so only the single pod with the LOWEST value
// remains (bottomk(1, ...)); label_replace extracts the instance name from
// the namespace ('vshn-<service>-<name>-<suffix>') into a `name` label.
// The 'vshn-replacemeplease' placeholder is later substituted with the real
// per-service prefix by the caller (see generatePrometheusNonSLORules).
// NOTE(review): '%(query)s' is filled positionally here because the
// right-hand side of `%` is a plain string, not an object — this relies on
// std.format's single-value fallback; confirm before changing the template.
local bottomPod(query) = 'label_replace( bottomk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query;
// Same as bottomPod, but keeps the pod with the HIGHEST value (topk(1, ...)).
local topPod(query) = 'label_replace( topk(1, %(query)s) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace) kube_namespace_labels, "name", "$1", "namespace", "vshn-replacemeplease-(.+)-.+")' % query;

// Build the non-SLO PrometheusRule (storage + memory alerts) for a VSHN
// AppCat service, wrapped in a crossplane kubernetes Object so it can be
// deployed from a composition.
//
// serviceName: CamelCase service name, e.g. 'VSHNRedis'; used verbatim as
//   the alert-name prefix and, lowercased, for group/object names and the
//   'vshn-<service>' namespace prefix.
// memoryContainerName: container whose memory working set is compared
//   against its memory limit.
// additionalAlertsRuleGroup: array of extra rule groups appended after the
//   generated storage and memory groups.
local generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup) = {
  // standardized lowercase regardless of what came as input
  local serviceNameLower = std.asciiLower(serviceName),
  // Placeholder baked into the bottomPod/topPod templates; swapped for the
  // real per-service namespace prefix in each expr below.
  local toReplace = 'vshn-replacemeplease',
  // Reusable PromQL fragments for the storage alerts.
  local queries = {
    availableStorage: 'kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}',
    availablePercent: '(%s / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"})' % queries.availableStorage,
    usedStorage: 'kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"}',
    // Skip read-only volumes and PVCs explicitly opted out of alerting.
    unlessExcluded: 'unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1',
  },
  name: 'prometheusrule',
  base: {
    apiVersion: 'kubernetes.crossplane.io/v1alpha1',
    kind: 'Object',
    metadata: {
      name: 'prometheusrule',
    },
    spec: {
      providerConfigRef: {
        name: 'kubernetes',
      },
      forProvider+: {
        manifest+: {
          apiVersion: 'monitoring.coreos.com/v1',
          kind: 'PrometheusRule',
          metadata: {
            name: '%s-rules' % serviceNameLower,
          },
          spec: {
            groups: [
              {
                name: '%s-storage' % serviceNameLower,
                rules: [
                  {
                    alert: serviceName + 'PersistentVolumeFillingUp',
                    annotations: {
                      description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.',
                      runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
                      summary: 'PersistentVolume is filling up.',
                    },
                    // Fires when less than 3% is free right now.
                    expr: std.strReplace(bottomPod('%(availablePercent)s < 0.03 and %(usedStorage)s > 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower),
                    'for': '1m',
                    labels: {
                      severity: 'critical',
                      syn_team: 'schedar',
                    },
                  },
                  {
                    alert: serviceName + 'PersistentVolumeFillingUp',
                    annotations: {
                      description: 'Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
                      runbook_url: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup',
                      summary: 'PersistentVolume is filling up.',
                    },
                    // Fires when below 15% free AND the linear 6h trend
                    // predicts exhaustion within four days.
                    expr: std.strReplace(bottomPod('%(availablePercent)s < 0.15 and %(usedStorage)s > 0 and predict_linear(%(availableStorage)s[6h], 4 * 24 * 3600) < 0 %(unlessExcluded)s' % queries), toReplace, 'vshn-' + serviceNameLower),
                    'for': '1h',
                    labels: {
                      severity: 'warning',
                    },
                  },
                ],
              },
              {
                // Reuse the shared lowercased name instead of recomputing it,
                // for consistency with the storage group above.
                name: serviceNameLower + '-memory',
                rules: [
                  {
                    alert: serviceName + 'MemoryCritical',
                    annotations: {
                      // Typo fixed: 'reducde' -> 'reduce'.
                      description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reduce the load of this instance, or increase the memory.',
                      runbook_url: 'https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical',
                      summary: 'Memory usage critical',
                    },
                    // The '%s' inside this query deliberately survives topPod()
                    // and is filled with the container name afterwards.
                    expr: std.strReplace(topPod('(container_memory_working_set_bytes{container="%s"} / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"} * 100) > 85') % memoryContainerName, toReplace, 'vshn-' + serviceNameLower),
                    'for': '120m',
                    labels: {
                      severity: 'critical',
                      syn_team: 'schedar',
                    },
                  },
                ],
              },
            ] + additionalAlertsRuleGroup,
          },
        },
      },
    },
  },
};

// Public interface of this library. Field order is irrelevant in jsonnet
// output, so entries are grouped by purpose.
{
  // SLA objective recording rule for a service.
  PromRuleSLA(value, service):
    promRuleSLA(value, service),
  // Shared disk- and memory-alert rules for a service.
  GeneratePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup):
    generatePrometheusNonSLORules(serviceName, memoryContainerName, additionalAlertsRuleGroup),
  // Query helpers selecting a single pod (lowest / highest value).
  BottomPod(query):
    bottomPod(query),
  TopPod(query):
    topPod(query),
}
6 changes: 3 additions & 3 deletions component/component/vshn_minio.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ local comp = import 'lib/appcat-compositions.libsonnet';
local crossplane = import 'lib/crossplane.libsonnet';

local common = import 'common.libsonnet';
local xrds = import 'xrds.libsonnet';

local prom = import 'prometheus.libsonnet';
local slos = import 'slos.libsonnet';
local xrds = import 'xrds.libsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;
Expand All @@ -23,7 +23,7 @@ local connectionSecretKeys = [
'AWS_ACCESS_KEY_ID',
];

local promRuleMinioSLA = common.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio');
local promRuleMinioSLA = prom.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio');

local minioPlans = common.FilterDisabledParams(minioParams.plans);

Expand Down
Loading

0 comments on commit 79ea63f

Please sign in to comment.