Skip to content

Commit

Permalink
Switch to sli_minio image
Browse files Browse the repository at this point in the history
  • Loading branch information
Kidswiss committed Nov 1, 2023
1 parent ed76643 commit 200eb18
Show file tree
Hide file tree
Showing 31 changed files with 1,075 additions and 49 deletions.
13 changes: 12 additions & 1 deletion component/class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ parameters:
appcat:
registry: ghcr.io
repository: vshn/appcat
tag: v4.37.0
tag: sli_minio
apiserver:
registry: ghcr.io
repository: vshn/appcat-apiserver
Expand Down Expand Up @@ -168,6 +168,16 @@ parameters:
# If the alert is pending for more than 5m this indicates a real problem.
for: 6m
ticket_alert: {}
minio:
uptime:
objective: 99.9
alerting:
page_alert:
# This should reduce non actionable alerts because of single instance restarts.
# The page alert looks (among other things) at the burn rate over the last 5min.
# If the alert is pending for more than 5m this indicates a real problem.
for: 6m
ticket_alert: {}

providers:
cloudscale:
Expand Down Expand Up @@ -420,6 +430,7 @@ parameters:
helmChartVersion: ${appcat:charts:minio:version}
grpcEndpoint: ${appcat:grpcEndpoint}
defaultPlan: standard-1
sla: 99.25
plans:
standard-1:
size:
Expand Down
72 changes: 52 additions & 20 deletions component/component/slos.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,25 @@ local prometheusRule(name) =
spec: patchedRules,
};

// getEvents builds the SLI event queries for the non-HA instances of one service.
//
// serviceName is spliced verbatim into the `service="..."` label matcher of
// every selector below.
//
// Returns an object with two PromQL strings over appcat_probes_seconds_count,
// both aggregated by (service, namespace, name, organization, sla):
//   error_query: rate of probes whose reason is not "success" (ha="false"),
//                minus scalar(appcat:cluster:maintenance), kept only where the
//                result is > 0; otherwise a zero-valued series is used.
//   total_query: rate of all probes with ha="false".
//
// Note that the 0*rate(...) fallback selectors match on the service label
// only; they are not restricted by the ha label.
// The `{{.window}}` placeholder is emitted unchanged; presumably the SLO
// generator substitutes the evaluation window later -- TODO confirm.
local getEvents(serviceName) = {
// The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="' + serviceName + '", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="' + serviceName + '"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="' + serviceName + '"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="' + serviceName + '", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla)',
};

// getEventsHA builds the SLI event queries for the HA instances of one service.
//
// serviceName is spliced verbatim into the `service="..."` label matcher of
// every selector below.
//
// Returns an object with two PromQL strings over appcat_probes_seconds_count,
// both aggregated by (service, namespace, name, organization, sla):
//   error_query: rate of probes whose reason is not "success" (ha="true"),
//                falling back to a zero-valued series.
//   total_query: rate of all probes with ha="true".
//
// NOTE(review): this error_query does not subtract
// scalar(appcat:cluster:maintenance) -- confirm that is intended for every
// HA service that uses this helper.
// The `{{.window}}` placeholder is emitted unchanged; presumably the SLO
// generator substitutes the evaluation window later -- TODO confirm.
local getEventsHA(serviceName) = {
// The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error
error_query: 'sum(rate(appcat_probes_seconds_count{reason!="success", service="' + serviceName + '", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="' + serviceName + '"}[{{.window}}])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="' + serviceName + '"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="' + serviceName + '", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
};

{
slothInput: {
'vshn-postgresql': [
newSLO('uptime', 'vshn-postgresql', params.slos.vshn.postgres.uptime) {
description: 'Uptime SLO for PostgreSQL by VSHN',
sli: {
events: {
// The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla)',
},
events: getEvents('VSHNPostgreSQL'),
},
alerting+: {
name: 'SLO_AppCat_VSHNPostgreSQLUptime',
Expand All @@ -86,11 +94,7 @@ local prometheusRule(name) =
newSLO('uptime', 'vshn-postgresql-ha', params.slos.vshn.postgres.uptime) {
description: 'Uptime SLO for High Available PostgreSQL by VSHN',
sli: {
events: {
// The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error
error_query: 'sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
},
events: getEventsHA('VSHNPostgreSQL'),
},
alerting+: {
name: 'SLO_AppCat_HAVSHNPosgtreSQLUptime',
Expand All @@ -109,11 +113,7 @@ local prometheusRule(name) =
newSLO('uptime', 'vshn-redis', params.slos.vshn.redis.uptime) {
description: 'Uptime SLO for Redis by VSHN',
sli: {
events: {
// The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla)',
},
events: getEvents('VSHNRedis'),
},
alerting+: {
name: 'SLO_AppCat_VSHNRedisUptime',
Expand All @@ -131,11 +131,7 @@ local prometheusRule(name) =
newSLO('uptime', 'vshn-redis-ha', params.slos.vshn.redis.uptime) {
description: 'Uptime SLO for High Available Redis by VSHN',
sli: {
events: {
// The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
},
events: getEventsHA('VSHNRedis'),
},
alerting+: {
name: 'SLO_AppCat_HAVSHNRedisUptime',
Expand All @@ -149,6 +145,42 @@ local prometheusRule(name) =
},
},
],
// Uptime SLO for non-HA Minio instances.
// The SLO is created with the name 'vshn-minio' and configured from
// params.slos.vshn.minio.uptime; its SLI events come from getEvents('VSHNMinio').
'vshn-minio': [
newSLO('uptime', 'vshn-minio', params.slos.vshn.minio.uptime) {
description: 'Uptime SLO for Minio by VSHN',
sli: {
events: getEvents('VSHNMinio'),
},
// `alerting+` extends the alerting settings produced by newSLO instead of replacing them.
alerting+: {
name: 'SLO_AppCat_VSHNMinioUptime',
annotations+: {
summary: 'Probes to Minio by VSHN instance fail',
},
labels+: {
service: 'VSHNMinio',
// Templated label: "true" when the alert's sla label equals "guaranteed", otherwise "false".
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
},
},
},
],
// Uptime SLO for HA Minio instances.
// The SLO is configured from params.slos.vshn.minio.uptime and its SLI events
// come from getEventsHA('VSHNMinio').
'vshn-minio-ha': [
  // The second argument must match this entry's key. It was previously
  // 'vshn-postgresql-ha' (copy/paste), which would register the Minio HA SLO
  // under the PostgreSQL HA name -- presumably clashing with that SLO's
  // generated rules; verify against newSLO.
  newSLO('uptime', 'vshn-minio-ha', params.slos.vshn.minio.uptime) {
    description: 'Uptime SLO for High Available Minio by VSHN',
    sli: {
      events: getEventsHA('VSHNMinio'),
    },
    // `alerting+` extends the alerting settings produced by newSLO instead of replacing them.
    alerting+: {
      name: 'SLO_AppCat_HAVSHNMinioUptime',
      annotations+: {
        summary: 'Probes to HA Minio by VSHN instance fail',
      },
      labels+: {
        service: 'VSHNMinio',
        // Templated label: "true" when the alert's sla label equals "guaranteed", otherwise "false".
        OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
      },
    },
  },
],
},
Get(name): prometheusRule(name),
}
7 changes: 7 additions & 0 deletions component/component/vshn_minio.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ local crossplane = import 'lib/crossplane.libsonnet';
local common = import 'common.libsonnet';
local xrds = import 'xrds.libsonnet';

local slos = import 'slos.libsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;
local minioParams = params.services.vshn.minio;
Expand All @@ -21,6 +23,8 @@ local connectionSecretKeys = [
'AWS_ACCESS_KEY_ID',
];

local promRuleMinioSLA = common.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio');

local minioPlans = common.FilterDisabledParams(minioParams.plans);

local xrd = xrds.XRDFromCRD(
Expand Down Expand Up @@ -97,5 +101,8 @@ if params.services.vshn.enabled && minioParams.enabled then {
'20_xrd_vshn_minio': xrd,
'20_rbac_vshn_minio': xrds.CompositeClusterRoles(xrd),
'21_composition_vshn_minio': composition,
'22_prom_rule_sla_minio': promRuleMinioSLA,
[if std.length(instances) != 0 then '22_minio_instances']: instances,
[if params.services.vshn.enabled && params.services.vshn.minio.enabled then 'sli_exporter/90_slo_vshn_minio']: slos.Get('vshn-minio'),
[if params.services.vshn.enabled && params.services.vshn.minio.enabled then 'sli_exporter/90_slo_vshn_minio_ha']: slos.Get('vshn-minio-ha'),
} else {}
4 changes: 4 additions & 0 deletions component/component/vshn_postgres.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ local crossplane = import 'lib/crossplane.libsonnet';
local common = import 'common.libsonnet';
local xrds = import 'xrds.libsonnet';

local slos = import 'slos.libsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;
local pgParams = params.services.vshn.postgres;
Expand Down Expand Up @@ -1101,4 +1103,6 @@ if params.services.vshn.enabled && pgParams.enabled then
'21_composition_vshn_postgresrestore': restoreComp,
'22_prom_rule_sla_postgres': promRulePostgresSLA,
[if isOpenshift then '21_openshift_template_postgresql_vshn']: osTemplate,
[if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql']: slos.Get('vshn-postgresql'),
[if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql_ha']: slos.Get('vshn-postgresql-ha'),
} else {}
4 changes: 4 additions & 0 deletions component/component/vshn_redis.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ local crossplane = import 'lib/crossplane.libsonnet';
local common = import 'common.libsonnet';
local xrds = import 'xrds.libsonnet';

local slos = import 'slos.libsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;
local redisParams = params.services.vshn.redis;
Expand Down Expand Up @@ -737,4 +739,6 @@ if params.services.vshn.enabled && redisParams.enabled then {
'21_composition_vshn_redis': composition,
'22_prom_rule_sla_redis': promRuleRedisSLA,
[if isOpenshift then '21_openshift_template_redis_vshn']: osTemplate,
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis']: slos.Get('vshn-redis'),
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis_ha']: slos.Get('vshn-redis-ha'),
} else {}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
env:
- name: PLANS_NAMESPACE
value: syn-appcat
image: ghcr.io/vshn/appcat:v4.37.0
image: ghcr.io/vshn/appcat:sli_minio
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
value: "false"
- name: APPCAT_SLI_VSHNREDIS
value: "false"
image: ghcr.io/vshn/appcat:v4.37.0
image: ghcr.io/vshn/appcat:sli_minio
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,20 @@ rules:
- get
- list
- watch
- apiGroups:
- vshn.appcat.vshn.io
resources:
- vshnminios
verbs:
- get
- list
- watch
- apiGroups:
- vshn.appcat.vshn.io
resources:
- vshnminios/status
verbs:
- get
- apiGroups:
- vshn.appcat.vshn.io
resources:
Expand Down Expand Up @@ -47,6 +61,20 @@ rules:
- vshnredis/status
verbs:
- get
- apiGroups:
- vshn.appcat.vshn.io
resources:
- xvshnminios
verbs:
- get
- list
- watch
- apiGroups:
- vshn.appcat.vshn.io
resources:
- xvshnminios/status
verbs:
- get
- apiGroups:
- vshn.appcat.vshn.io
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ spec:
data:
controlNamespace: syn-appcat-control
defaultPlan: standard-1
imageTag: v4.37.0
imageTag: sli_minio
maintenanceSA: helm-based-service-maintenance
minioChartRepository: https://charts.min.io
minioChartVersion: 5.0.13
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
annotations: {}
labels:
name: vshn-vshnminio-sla
name: vshn-vshnminio-sla
namespace: appcat-slos
spec:
groups:
- name: appcat-vshnminio-sla-target
rules:
- expr: vector(99.25)
labels:
service: VSHNMinio
record: sla:objective:ratio
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
env:
- name: PLANS_NAMESPACE
value: syn-appcat
image: ghcr.io/vshn/appcat:v4.37.0
image: ghcr.io/vshn/appcat:sli_minio
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
envFrom:
- secretRef:
name: appcat-sla-reports-creds
image: ghcr.io/vshn/appcat:v4.37.0
image: ghcr.io/vshn/appcat:sli_minio
name: sla-reporter
resources:
limits:
Expand Down
Loading

0 comments on commit 200eb18

Please sign in to comment.