From 713106ca75d13c8be1d5a26216470db3515fcb9a Mon Sep 17 00:00:00 2001 From: Simon Beck Date: Wed, 1 Nov 2023 09:37:03 +0100 Subject: [PATCH] Switch to sli_minio image --- component/class/defaults.yml | 13 +- component/component/slos.libsonnet | 72 ++++-- component/component/vshn_minio.jsonnet | 7 + component/component/vshn_postgres.jsonnet | 4 + component/component/vshn_redis.jsonnet | 4 + .../controllers/appcat/30_deployment.yaml | 2 +- ...appcat-sliexporter-controller-manager.yaml | 2 +- ...ppcat-sliexporter-appcat-sli-exporter.yaml | 28 +++ .../appcat/21_composition_vshn_minio.yaml | 2 +- .../appcat/appcat/22_prom_rule_sla_minio.yaml | 16 ++ .../controllers/appcat/30_deployment.yaml | 2 +- .../appcat/sla_reporter/01_cronjob.yaml | 2 +- .../sli_exporter/90_slo_vshn_minio.yaml | 206 ++++++++++++++++++ .../sli_exporter/90_slo_vshn_minio_ha.yaml | 206 ++++++++++++++++++ ...appcat-sliexporter-controller-manager.yaml | 2 +- ...ppcat-sliexporter-appcat-sli-exporter.yaml | 28 +++ ...appcat-sliexporter-controller-manager.yaml | 2 +- ...ppcat-sliexporter-appcat-sli-exporter.yaml | 28 +++ .../appcat/21_composition_vshn_minio.yaml | 2 +- .../appcat/21_composition_vshn_postgres.yaml | 2 +- .../21_composition_vshn_postgresrestore.yaml | 2 +- .../appcat/21_composition_vshn_redis.yaml | 2 +- .../appcat/appcat/22_prom_rule_sla_minio.yaml | 16 ++ .../controllers/appcat/30_deployment.yaml | 2 +- .../appcat/sla_reporter/01_cronjob.yaml | 2 +- .../sli_exporter/90_slo_vshn_minio.yaml | 206 ++++++++++++++++++ .../sli_exporter/90_slo_vshn_minio_ha.yaml | 206 ++++++++++++++++++ .../sli_exporter/90_slo_vshn_redis.yaml | 14 +- .../sli_exporter/90_slo_vshn_redis_ha.yaml | 14 +- ...appcat-sliexporter-controller-manager.yaml | 2 +- ...ppcat-sliexporter-appcat-sli-exporter.yaml | 28 +++ package/main.yaml | 4 +- 32 files changed, 1077 insertions(+), 51 deletions(-) create mode 100644 component/tests/golden/minio/appcat/appcat/22_prom_rule_sla_minio.yaml create mode 100644 
component/tests/golden/minio/appcat/appcat/sli_exporter/90_slo_vshn_minio.yaml create mode 100644 component/tests/golden/minio/appcat/appcat/sli_exporter/90_slo_vshn_minio_ha.yaml create mode 100644 component/tests/golden/vshn/appcat/appcat/22_prom_rule_sla_minio.yaml create mode 100644 component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_minio.yaml create mode 100644 component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_minio_ha.yaml diff --git a/component/class/defaults.yml b/component/class/defaults.yml index 69a9cadbe..6f2e4ebe7 100644 --- a/component/class/defaults.yml +++ b/component/class/defaults.yml @@ -39,7 +39,7 @@ parameters: appcat: registry: ghcr.io repository: vshn/appcat - tag: v4.37.0 + tag: sli_minio apiserver: registry: ghcr.io repository: vshn/appcat-apiserver @@ -168,6 +168,16 @@ parameters: # If the alert is pending for more than 5m this indicates a real problem. for: 6m ticket_alert: {} + minio: + uptime: + objective: 99.9 + alerting: + page_alert: + # This should reduce non actionable alerts because of single instance restarts. + # The page alert looks (among other things) at the burn rate over the last 5min. + # If the alert is pending for more than 5m this indicates a real problem. + for: 6m + ticket_alert: {} providers: cloudscale: @@ -420,6 +430,7 @@ parameters: helmChartVersion: ${appcat:charts:minio:version} grpcEndpoint: ${appcat:grpcEndpoint} defaultPlan: standard-1 + sla: 99.25 plans: standard-1: size: diff --git a/component/component/slos.libsonnet b/component/component/slos.libsonnet index c4133f45a..da6766c7f 100644 --- a/component/component/slos.libsonnet +++ b/component/component/slos.libsonnet @@ -58,17 +58,25 @@ local prometheusRule(name) = spec: patchedRules, }; +local getEvents(serviceName) = { + // The 0*rate(...) 
makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error + error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="' + serviceName + '", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="' + serviceName + '"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="' + serviceName + '"}[{{.window}}])) by (service, namespace, name, organization, sla)', + total_query: 'sum(rate(appcat_probes_seconds_count{service="' + serviceName + '", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla)', +}; + +local getEventsHA(serviceName) = { + // The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error + error_query: 'sum(rate(appcat_probes_seconds_count{reason!="success", service="' + serviceName + '", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="' + serviceName + '"}[{{.window}}])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="' + serviceName + '"}[{{.window}}])) by (service, namespace, name, organization, sla)', + total_query: 'sum(rate(appcat_probes_seconds_count{service="' + serviceName + '", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)', +}; + { slothInput: { 'vshn-postgresql': [ newSLO('uptime', 'vshn-postgresql', params.slos.vshn.postgres.uptime) { description: 'Uptime SLO for PostgreSQL by VSHN', sli: { - events: { - // The 0*rate(...) 
makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error - error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla)', - total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla)', - }, + events: getEvents('VSHNPostgreSQL'), }, alerting+: { name: 'SLO_AppCat_VSHNPostgreSQLUptime', @@ -86,11 +94,7 @@ local prometheusRule(name) = newSLO('uptime', 'vshn-postgresql-ha', params.slos.vshn.postgres.uptime) { description: 'Uptime SLO for High Available PostgreSQL by VSHN', sli: { - events: { - // The 0*rate(...) 
makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error - error_query: 'sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla)', - total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)', - }, + events: getEventsHA('VSHNPostgreSQL'), }, alerting+: { name: 'SLO_AppCat_HAVSHNPosgtreSQLUptime', @@ -109,11 +113,7 @@ local prometheusRule(name) = newSLO('uptime', 'vshn-redis', params.slos.vshn.redis.uptime) { description: 'Uptime SLO for Redis by VSHN', sli: { - events: { - // The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error - error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)', - total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla)', - }, + events: getEvents('VSHNRedis'), }, alerting+: { name: 'SLO_AppCat_VSHNRedisUptime', @@ -131,11 +131,7 @@ local prometheusRule(name) = newSLO('uptime', 'vshn-redis-ha', params.slos.vshn.redis.uptime) { description: 'Uptime SLO for High Available Redis by VSHN', sli: { - events: { - // The 0*rate(...) 
makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error - error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)', - total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)', - }, + events: getEventsHA('VSHNRedis'), }, alerting+: { name: 'SLO_AppCat_HAVSHNRedisUptime', @@ -149,6 +145,42 @@ local prometheusRule(name) = }, }, ], + 'vshn-minio': [ + newSLO('uptime', 'vshn-minio', params.slos.vshn.minio.uptime) { + description: 'Uptime SLO for Minio by VSHN', + sli: { + events: getEvents('VSHNMinio'), + }, + alerting+: { + name: 'SLO_AppCat_VSHNMinioUptime', + annotations+: { + summary: 'Probes to Minio by VSHN instance fail', + }, + labels+: { + service: 'VSHNMinio', + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}', + }, + }, + }, + ], + 'vshn-minio-ha': [ + newSLO('uptime', 'vshn-postgresql-ha', params.slos.vshn.minio.uptime) { + description: 'Uptime SLO for High Available Minio by VSHN', + sli: { + events: getEventsHA('VSHNMinio'), + }, + alerting+: { + name: 'SLO_AppCat_HAVSHNMinioUptime', + annotations+: { + summary: 'Probes to HA Minio by VSHN instance fail', + }, + labels+: { + service: 'VSHNMinio', + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}', + }, + }, + }, + ], }, Get(name): prometheusRule(name), } diff --git a/component/component/vshn_minio.jsonnet b/component/component/vshn_minio.jsonnet index 741954215..6649c40be 100644 --- a/component/component/vshn_minio.jsonnet +++ 
b/component/component/vshn_minio.jsonnet @@ -8,6 +8,8 @@ local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; local xrds = import 'xrds.libsonnet'; +local slos = import 'slos.libsonnet'; + local inv = kap.inventory(); local params = inv.parameters.appcat; local minioParams = params.services.vshn.minio; @@ -21,6 +23,8 @@ local connectionSecretKeys = [ 'AWS_ACCESS_KEY_ID', ]; +local promRuleMinioSLA = common.PromRuleSLA(params.services.vshn.minio.sla, 'VSHNMinio'); + local minioPlans = common.FilterDisabledParams(minioParams.plans); local xrd = xrds.XRDFromCRD( @@ -97,5 +101,8 @@ if params.services.vshn.enabled && minioParams.enabled then { '20_xrd_vshn_minio': xrd, '20_rbac_vshn_minio': xrds.CompositeClusterRoles(xrd), '21_composition_vshn_minio': composition, + '22_prom_rule_sla_minio': promRuleMinioSLA, [if std.length(instances) != 0 then '22_minio_instances']: instances, + [if params.services.vshn.enabled && params.services.vshn.minio.enabled then 'sli_exporter/90_slo_vshn_minio']: slos.Get('vshn-minio'), + [if params.services.vshn.enabled && params.services.vshn.minio.enabled then 'sli_exporter/90_slo_vshn_minio_ha']: slos.Get('vshn-minio-ha'), } else {} diff --git a/component/component/vshn_postgres.jsonnet b/component/component/vshn_postgres.jsonnet index 7fb39e1a6..a1fc80a9e 100644 --- a/component/component/vshn_postgres.jsonnet +++ b/component/component/vshn_postgres.jsonnet @@ -9,6 +9,8 @@ local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; local xrds = import 'xrds.libsonnet'; +local slos = import 'slos.libsonnet'; + local inv = kap.inventory(); local params = inv.parameters.appcat; local pgParams = params.services.vshn.postgres; @@ -1101,4 +1103,6 @@ if params.services.vshn.enabled && pgParams.enabled then '21_composition_vshn_postgresrestore': restoreComp, '22_prom_rule_sla_postgres': promRulePostgresSLA, [if isOpenshift then '21_openshift_template_postgresql_vshn']: 
osTemplate, + [if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql']: slos.Get('vshn-postgresql'), + [if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql_ha']: slos.Get('vshn-postgresql-ha'), } else {} diff --git a/component/component/vshn_redis.jsonnet b/component/component/vshn_redis.jsonnet index 907fc81f6..c48f28989 100644 --- a/component/component/vshn_redis.jsonnet +++ b/component/component/vshn_redis.jsonnet @@ -8,6 +8,8 @@ local crossplane = import 'lib/crossplane.libsonnet'; local common = import 'common.libsonnet'; local xrds = import 'xrds.libsonnet'; +local slos = import 'slos.libsonnet'; + local inv = kap.inventory(); local params = inv.parameters.appcat; local redisParams = params.services.vshn.redis; @@ -737,4 +739,6 @@ if params.services.vshn.enabled && redisParams.enabled then { '21_composition_vshn_redis': composition, '22_prom_rule_sla_redis': promRuleRedisSLA, [if isOpenshift then '21_openshift_template_redis_vshn']: osTemplate, + [if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis']: slos.Get('vshn-redis'), + [if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis_ha']: slos.Get('vshn-redis-ha'), } else {} diff --git a/component/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml b/component/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml index 8becd57aa..fccec260b 100644 --- a/component/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml +++ b/component/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml @@ -23,7 +23,7 @@ spec: env: - name: PLANS_NAMESPACE value: syn-appcat - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio livenessProbe: 
httpGet: path: /healthz diff --git a/component/tests/golden/defaults/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/component/tests/golden/defaults/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml index 62e33f810..418b0a35a 100644 --- a/component/tests/golden/defaults/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml +++ b/component/tests/golden/defaults/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml @@ -28,7 +28,7 @@ spec: value: "false" - name: APPCAT_SLI_VSHNREDIS value: "false" - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio livenessProbe: httpGet: path: /healthz diff --git a/component/tests/golden/defaults/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml b/component/tests/golden/defaults/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml index aea288f1b..36c77088d 100644 --- a/component/tests/golden/defaults/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml +++ b/component/tests/golden/defaults/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml @@ -19,6 +19,20 @@ rules: - get - list - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - vshnminios + verbs: + - get + - list + - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - vshnminios/status + verbs: + - get - apiGroups: - vshn.appcat.vshn.io resources: @@ -47,6 +61,20 @@ rules: - vshnredis/status verbs: - get +- apiGroups: + - vshn.appcat.vshn.io + resources: + - xvshnminios + verbs: + - get + - list + - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - xvshnminios/status + verbs: + - get - apiGroups: - vshn.appcat.vshn.io resources: diff --git 
a/component/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml b/component/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml index 357e91a7b..b48e75fea 100644 --- a/component/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml +++ b/component/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml @@ -25,7 +25,7 @@ spec: data: controlNamespace: syn-appcat-control defaultPlan: standard-1 - imageTag: v4.37.0 + imageTag: sli_minio maintenanceSA: helm-based-service-maintenance minioChartRepository: https://charts.min.io minioChartVersion: 5.0.13 diff --git a/component/tests/golden/minio/appcat/appcat/22_prom_rule_sla_minio.yaml b/component/tests/golden/minio/appcat/appcat/22_prom_rule_sla_minio.yaml new file mode 100644 index 000000000..41625481a --- /dev/null +++ b/component/tests/golden/minio/appcat/appcat/22_prom_rule_sla_minio.yaml @@ -0,0 +1,16 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: vshn-vshnminio-sla + name: vshn-vshnminio-sla + namespace: appcat-slos +spec: + groups: + - name: appcat-vshnminio-sla-target + rules: + - expr: vector(99.25) + labels: + service: VSHNMinio + record: sla:objective:ratio diff --git a/component/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml b/component/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml index 8becd57aa..fccec260b 100644 --- a/component/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml +++ b/component/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml @@ -23,7 +23,7 @@ spec: env: - name: PLANS_NAMESPACE value: syn-appcat - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio livenessProbe: httpGet: path: /healthz diff --git a/component/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml b/component/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml index a18cc78fa..37a84cd2c 100644 
--- a/component/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml +++ b/component/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml @@ -30,7 +30,7 @@ spec: envFrom: - secretRef: name: appcat-sla-reports-creds - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio name: sla-reporter resources: limits: diff --git a/component/tests/golden/minio/appcat/appcat/sli_exporter/90_slo_vshn_minio.yaml b/component/tests/golden/minio/appcat/appcat/sli_exporter/90_slo_vshn_minio.yaml new file mode 100644 index 000000000..8123744de --- /dev/null +++ b/component/tests/golden/minio/appcat/appcat/sli_exporter/90_slo_vshn_minio.yaml @@ -0,0 +1,206 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: vshn-minio + name: vshn-minio + namespace: appcat-slos +spec: + groups: + - name: sloth-slo-sli-recordings-appcat-vshn-minio-uptime + rules: + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[5m])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[5m])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[30m])) by (service, namespace, 
name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[30m])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[1h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[2h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[2h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or 
sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[6h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[6h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1d])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[1d])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[3d])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[3d])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"}[30d]) + / ignoring (sloth_window) + 
count_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"}[30d]) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-appcat-vshn-minio-uptime + rules: + - expr: vector(0.9990000000000001) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:objective:ratio + - expr: vector(1-0.9990000000000001) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:error_budget:ratio + - expr: vector(30) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="appcat-vshn-minio-uptime", + sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + 
record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_mode: cli-gen-prom + sloth_objective: '99.9' + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + record: sloth_slo_info + - name: sloth-slo-alerts-appcat-vshn-minio-uptime + rules: + - alert: SLO_AppCat_VSHNMinioUptime + annotations: + for: 6m + summary: Probes to Minio by VSHN instance fail + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. + expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) + ) + for: 6m + labels: + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end + }}' + service: VSHNMinio + severity: critical + slo: 'true' + sloth_severity: page + syn: 'true' + syn_component: appcat + syn_team: schedar + - alert: SLO_AppCat_VSHNMinioUptime + annotations: + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-minio.html#uptime + summary: Probes to Minio by VSHN instance fail + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. 
+ expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) + ) + labels: + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end + }}' + service: VSHNMinio + severity: warning + slo: 'true' + sloth_severity: ticket + syn: 'true' + syn_component: appcat + syn_team: schedar diff --git a/component/tests/golden/minio/appcat/appcat/sli_exporter/90_slo_vshn_minio_ha.yaml b/component/tests/golden/minio/appcat/appcat/sli_exporter/90_slo_vshn_minio_ha.yaml new file mode 100644 index 000000000..e17ed372f --- /dev/null +++ b/component/tests/golden/minio/appcat/appcat/sli_exporter/90_slo_vshn_minio_ha.yaml @@ -0,0 +1,206 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: vshn-minio-ha + name: vshn-minio-ha + namespace: appcat-slos +spec: + groups: + - name: sloth-slo-sli-recordings-appcat-vshn-minio-ha-uptime + rules: + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[5m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[5m])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", 
ha="true"}[5m])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[30m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[30m])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[30m])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[1h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[2h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[2h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[2h])) by (service, namespace, name, organization, sla)) + labels: + 
sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[6h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[6h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[6h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1d])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[1d])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[3d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[3d])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[3d])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: 
uptime + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"}[30d]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"}[30d]) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-appcat-vshn-minio-ha-uptime + rules: + - expr: vector(0.9990000000000001) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:objective:ratio + - expr: vector(1-0.9990000000000001) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:error_budget:ratio + - expr: vector(30) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: 
appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="appcat-vshn-minio-ha-uptime", + sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_mode: cli-gen-prom + sloth_objective: '99.9' + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + record: sloth_slo_info + - name: sloth-slo-alerts-appcat-vshn-minio-ha-uptime + rules: + - alert: SLO_AppCat_HAVSHNMinioUptime + annotations: + for: 6m + summary: Probes to HA Minio by VSHN instance fail + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. + expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) + ) + for: 6m + labels: + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end + }}' + service: VSHNMinio + severity: critical + slo: 'true' + sloth_severity: page + syn: 'true' + syn_component: appcat + syn_team: schedar + - alert: SLO_AppCat_HAVSHNMinioUptime + 
annotations: + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql-ha.html#uptime + summary: Probes to HA Minio by VSHN instance fail + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. + expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) + ) + labels: + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end + }}' + service: VSHNMinio + severity: warning + slo: 'true' + sloth_severity: ticket + syn: 'true' + syn_component: appcat + syn_team: schedar diff --git a/component/tests/golden/minio/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/component/tests/golden/minio/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml index 62e33f810..418b0a35a 100644 --- a/component/tests/golden/minio/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml +++ b/component/tests/golden/minio/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml @@ -28,7 +28,7 @@ spec: value: "false" - name: APPCAT_SLI_VSHNREDIS value: "false" - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio livenessProbe: httpGet: path: /healthz 
diff --git a/component/tests/golden/minio/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml b/component/tests/golden/minio/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml index aea288f1b..36c77088d 100644 --- a/component/tests/golden/minio/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml +++ b/component/tests/golden/minio/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml @@ -19,6 +19,20 @@ rules: - get - list - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - vshnminios + verbs: + - get + - list + - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - vshnminios/status + verbs: + - get - apiGroups: - vshn.appcat.vshn.io resources: @@ -47,6 +61,20 @@ rules: - vshnredis/status verbs: - get +- apiGroups: + - vshn.appcat.vshn.io + resources: + - xvshnminios + verbs: + - get + - list + - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - xvshnminios/status + verbs: + - get - apiGroups: - vshn.appcat.vshn.io resources: diff --git a/component/tests/golden/openshift/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/component/tests/golden/openshift/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml index 62e33f810..418b0a35a 100644 --- a/component/tests/golden/openshift/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml +++ b/component/tests/golden/openshift/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml @@ -28,7 +28,7 @@ spec: value: "false" - name: APPCAT_SLI_VSHNREDIS value: "false" - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio livenessProbe: httpGet: path: /healthz diff --git 
a/component/tests/golden/openshift/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml b/component/tests/golden/openshift/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml index aea288f1b..36c77088d 100644 --- a/component/tests/golden/openshift/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml +++ b/component/tests/golden/openshift/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml @@ -19,6 +19,20 @@ rules: - get - list - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - vshnminios + verbs: + - get + - list + - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - vshnminios/status + verbs: + - get - apiGroups: - vshn.appcat.vshn.io resources: @@ -47,6 +61,20 @@ rules: - vshnredis/status verbs: - get +- apiGroups: + - vshn.appcat.vshn.io + resources: + - xvshnminios + verbs: + - get + - list + - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - xvshnminios/status + verbs: + - get - apiGroups: - vshn.appcat.vshn.io resources: diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml index 610b3cf49..a4494cd71 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml @@ -25,7 +25,7 @@ spec: data: controlNamespace: syn-appcat-control defaultPlan: standard-1 - imageTag: v4.37.0 + imageTag: sli_minio maintenanceSA: helm-based-service-maintenance minioChartRepository: https://charts.min.io minioChartVersion: 5.0.13 diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index 
5090a386c..207fdffaf 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -33,7 +33,7 @@ spec: emailAlertingSmtpHost: smtp.eu.mailgun.org:465 emailAlertingSmtpUsername: myuser@example.com externalDatabaseConnectionsEnabled: 'true' - imageTag: v4.37.0 + imageTag: sli_minio quotasEnabled: 'false' sgNamespace: stackgres sideCars: '{"clusterController": {"limits": {"cpu": "32m", "memory": "2Gi"}, diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 66fd88cff..cbecb3f46 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -33,7 +33,7 @@ spec: emailAlertingSmtpHost: smtp.eu.mailgun.org:465 emailAlertingSmtpUsername: myuser@example.com externalDatabaseConnectionsEnabled: 'true' - imageTag: v4.37.0 + imageTag: sli_minio quotasEnabled: 'false' sgNamespace: stackgres sideCars: '{"clusterController": {"limits": {"cpu": "32m", "memory": "2Gi"}, diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index 08b7d6d53..99a9841f8 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -30,7 +30,7 @@ spec: data: bucketRegion: lpg controlNamespace: syn-appcat-control - imageTag: v4.37.0 + imageTag: sli_minio maintenanceSA: helm-based-service-maintenance quotasEnabled: 'false' restoreSA: redisrestoreserviceaccount diff --git a/component/tests/golden/vshn/appcat/appcat/22_prom_rule_sla_minio.yaml b/component/tests/golden/vshn/appcat/appcat/22_prom_rule_sla_minio.yaml new file mode 100644 
index 000000000..41625481a --- /dev/null +++ b/component/tests/golden/vshn/appcat/appcat/22_prom_rule_sla_minio.yaml @@ -0,0 +1,16 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: vshn-vshnminio-sla + name: vshn-vshnminio-sla + namespace: appcat-slos +spec: + groups: + - name: appcat-vshnminio-sla-target + rules: + - expr: vector(99.25) + labels: + service: VSHNMinio + record: sla:objective:ratio diff --git a/component/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml b/component/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml index 8becd57aa..fccec260b 100644 --- a/component/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml +++ b/component/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml @@ -23,7 +23,7 @@ spec: env: - name: PLANS_NAMESPACE value: syn-appcat - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio livenessProbe: httpGet: path: /healthz diff --git a/component/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml b/component/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml index b75d27661..09f125c9e 100644 --- a/component/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml +++ b/component/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml @@ -30,7 +30,7 @@ spec: envFrom: - secretRef: name: appcat-sla-reports-creds - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio name: sla-reporter resources: limits: diff --git a/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_minio.yaml b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_minio.yaml new file mode 100644 index 000000000..8123744de --- /dev/null +++ b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_minio.yaml @@ -0,0 +1,206 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + 
name: vshn-minio + name: vshn-minio + namespace: appcat-slos +spec: + groups: + - name: sloth-slo-sli-recordings-appcat-vshn-minio-uptime + rules: + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[5m])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[5m])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[30m])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[30m])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", 
ha="false"}[1h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[2h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[2h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[6h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[6h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1d])) by (service, namespace, name, organization, 
sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[1d])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="false"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[3d])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="false"}[3d])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"}[30d]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"}[30d]) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-appcat-vshn-minio-uptime + rules: + - expr: vector(0.9990000000000001) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:objective:ratio + - expr: vector(1-0.9990000000000001) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:error_budget:ratio + - expr: vector(30) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: 
uptime + record: slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="appcat-vshn-minio-uptime", + sloth_service="appcat-vshn-minio", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_service: appcat-vshn-minio + sloth_slo: uptime + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + sloth_id: appcat-vshn-minio-uptime + sloth_mode: cli-gen-prom + sloth_objective: '99.9' + sloth_service: appcat-vshn-minio + sloth_slo: uptime + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + record: sloth_slo_info + - name: sloth-slo-alerts-appcat-vshn-minio-uptime + rules: + - alert: SLO_AppCat_VSHNMinioUptime + annotations: + for: 6m + summary: Probes to Minio by VSHN instance fail + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. 
+ expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) + ) + for: 6m + labels: + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end + }}' + service: VSHNMinio + severity: critical + slo: 'true' + sloth_severity: page + syn: 'true' + syn_component: appcat + syn_team: schedar + - alert: SLO_AppCat_VSHNMinioUptime + annotations: + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-minio.html#uptime + summary: Probes to Minio by VSHN instance fail + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. 
+ expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) + ) + labels: + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end + }}' + service: VSHNMinio + severity: warning + slo: 'true' + sloth_severity: ticket + syn: 'true' + syn_component: appcat + syn_team: schedar diff --git a/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_minio_ha.yaml b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_minio_ha.yaml new file mode 100644 index 000000000..e17ed372f --- /dev/null +++ b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_minio_ha.yaml @@ -0,0 +1,206 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: vshn-minio-ha + name: vshn-minio-ha + namespace: appcat-slos +spec: + groups: + - name: sloth-slo-sli-recordings-appcat-vshn-minio-ha-uptime + rules: + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[5m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[5m])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", 
ha="true"}[5m])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[30m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[30m])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[30m])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[1h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[2h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[2h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[2h])) by (service, namespace, name, organization, sla)) + labels: + 
sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[6h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[6h])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[6h])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[1d])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[1d])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNMinio", ha="true"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[3d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNMinio"}[3d])) by (service, namespace, name, organization, sla)) + / + (sum(rate(appcat_probes_seconds_count{service="VSHNMinio", ha="true"}[3d])) by (service, namespace, name, organization, sla)) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: 
uptime + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"}[30d]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"}[30d]) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-appcat-vshn-minio-ha-uptime + rules: + - expr: vector(0.9990000000000001) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:objective:ratio + - expr: vector(1-0.9990000000000001) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:error_budget:ratio + - expr: vector(30) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: 
appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="appcat-vshn-minio-ha-uptime", + sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + sloth_id: appcat-vshn-minio-ha-uptime + sloth_mode: cli-gen-prom + sloth_objective: '99.9' + sloth_service: appcat-vshn-minio-ha + sloth_slo: uptime + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + record: sloth_slo_info + - name: sloth-slo-alerts-appcat-vshn-minio-ha-uptime + rules: + - alert: SLO_AppCat_HAVSHNMinioUptime + annotations: + for: 6m + summary: Probes to HA Minio by VSHN instance fail + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. + expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) + ) + for: 6m + labels: + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end + }}' + service: VSHNMinio + severity: critical + slo: 'true' + sloth_severity: page + syn: 'true' + syn_component: appcat + syn_team: schedar + - alert: SLO_AppCat_HAVSHNMinioUptime + 
annotations: + runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql-ha.html#uptime + summary: Probes to HA Minio by VSHN instance fail + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. + expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) + ) + labels: + OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end + }}' + service: VSHNMinio + severity: warning + slo: 'true' + sloth_severity: ticket + syn: 'true' + syn_component: appcat + syn_team: schedar diff --git a/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis.yaml index 27e4324ce..a47ec177e 100644 --- a/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis.yaml @@ -11,7 +11,7 @@ spec: - name: sloth-slo-sli-recordings-appcat-vshn-redis-uptime rules: - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - 
scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla)) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[5m])) by (service, namespace, name, organization, sla)) labels: @@ -21,7 +21,7 @@ spec: sloth_window: 5m record: slo:sli_error:ratio_rate5m - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla)) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[30m])) by (service, namespace, name, organization, sla)) labels: @@ -31,7 +31,7 @@ spec: sloth_window: 30m record: slo:sli_error:ratio_rate30m - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - 
scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla)) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[1h])) by (service, namespace, name, organization, sla)) labels: @@ -41,7 +41,7 @@ spec: sloth_window: 1h record: slo:sli_error:ratio_rate1h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla)) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[2h])) by (service, namespace, name, organization, sla)) labels: @@ -51,7 +51,7 @@ spec: sloth_window: 2h record: slo:sli_error:ratio_rate2h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - 
scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla)) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[6h])) by (service, namespace, name, organization, sla)) labels: @@ -61,7 +61,7 @@ spec: sloth_window: 6h record: slo:sli_error:ratio_rate6h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla)) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[1d])) by (service, namespace, name, organization, sla)) labels: @@ -71,7 +71,7 @@ spec: sloth_window: 1d record: slo:sli_error:ratio_rate1d - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - 
scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla)) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[3d])) by (service, namespace, name, organization, sla)) labels: diff --git a/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis_ha.yaml b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis_ha.yaml index 6c5be2275..ef68dad83 100644 --- a/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis_ha.yaml +++ b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis_ha.yaml @@ -11,7 +11,7 @@ spec: - name: sloth-slo-sli-recordings-appcat-vshn-redis-ha-uptime rules: - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla)) + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[5m])) by (service, namespace, name, 
organization, sla)) labels: @@ -21,7 +21,7 @@ spec: sloth_window: 5m record: slo:sli_error:ratio_rate5m - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla)) + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[30m])) by (service, namespace, name, organization, sla)) labels: @@ -31,7 +31,7 @@ spec: sloth_window: 30m record: slo:sli_error:ratio_rate30m - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla)) + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[1h])) by (service, namespace, name, organization, sla)) labels: @@ -41,7 +41,7 @@ spec: sloth_window: 1h record: slo:sli_error:ratio_rate1h - expr: | - 
((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla)) + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[2h])) by (service, namespace, name, organization, sla)) labels: @@ -51,7 +51,7 @@ spec: sloth_window: 2h record: slo:sli_error:ratio_rate2h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla)) + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[6h])) by (service, namespace, name, organization, sla)) labels: @@ -61,7 +61,7 @@ spec: sloth_window: 6h record: slo:sli_error:ratio_rate6h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[1d]) or 
0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla)) + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[1d])) by (service, namespace, name, organization, sla)) labels: @@ -71,7 +71,7 @@ spec: sloth_window: 1d record: slo:sli_error:ratio_rate1d - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla)) + (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla)) / (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[3d])) by (service, namespace, name, organization, sla)) labels: diff --git a/component/tests/golden/vshn/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/component/tests/golden/vshn/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml index 4dabda199..407d3be31 100644 --- 
a/component/tests/golden/vshn/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml +++ b/component/tests/golden/vshn/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml @@ -28,7 +28,7 @@ spec: value: "true" - name: APPCAT_SLI_VSHNREDIS value: "true" - image: ghcr.io/vshn/appcat:v4.37.0 + image: ghcr.io/vshn/appcat:sli_minio livenessProbe: httpGet: path: /healthz diff --git a/component/tests/golden/vshn/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml b/component/tests/golden/vshn/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml index aea288f1b..36c77088d 100644 --- a/component/tests/golden/vshn/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml +++ b/component/tests/golden/vshn/appcat/appcat/sli_exporter/rbac.authorization.k8s.io_v1_clusterrole_appcat-sliexporter-appcat-sli-exporter.yaml @@ -19,6 +19,20 @@ rules: - get - list - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - vshnminios + verbs: + - get + - list + - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - vshnminios/status + verbs: + - get - apiGroups: - vshn.appcat.vshn.io resources: @@ -47,6 +61,20 @@ rules: - vshnredis/status verbs: - get +- apiGroups: + - vshn.appcat.vshn.io + resources: + - xvshnminios + verbs: + - get + - list + - watch +- apiGroups: + - vshn.appcat.vshn.io + resources: + - xvshnminios/status + verbs: + - get - apiGroups: - vshn.appcat.vshn.io resources: diff --git a/package/main.yaml b/package/main.yaml index f4bd736bc..0e4b712b2 100644 --- a/package/main.yaml +++ b/package/main.yaml @@ -3,11 +3,11 @@ applications: - crossplane parameters: pkg.appcat: - componentVersion: v2.4.3 + componentVersion: add/minio_sla image: registry: ghcr.io repository: vshn/appcat - tag: v4.38.0 + tag: sli_minio components: 
appcat: url: https://github.com/vshn/component-appcat.git