From 1f1e0b2d047de3f39929a2c64c8672460e409f71 Mon Sep 17 00:00:00 2001 From: Nicolas Bigler Date: Tue, 3 Oct 2023 17:59:15 +0200 Subject: [PATCH] Add redis sla reporting and fix non-working maintenance exclusion Signed-off-by: Nicolas Bigler --- class/defaults.yml | 3 +- component/main.jsonnet | 2 +- component/slos.libsonnet | 10 +++--- component/vshn_redis.jsonnet | 3 ++ .../10_appcat_maintenance_recording_rule.yaml | 4 +-- .../10_appcat_maintenance_recording_rule.yaml | 4 +-- .../10_appcat_maintenance_recording_rule.yaml | 4 +-- .../controllers/appcat/30_deployment.yaml | 2 +- .../10_appcat_maintenance_recording_rule.yaml | 4 +-- ...appcat-sliexporter-controller-manager.yaml | 2 +- .../10_appcat_maintenance_recording_rule.yaml | 4 +-- .../10_appcat_maintenance_recording_rule.yaml | 4 +-- .../appcat/21_composition_vshn_minio.yaml | 2 +- .../controllers/appcat/30_deployment.yaml | 2 +- .../appcat/sla_reporter/01_cronjob.yaml | 2 +- ...appcat-sliexporter-controller-manager.yaml | 2 +- .../10_appcat_maintenance_recording_rule.yaml | 4 +-- ...appcat-sliexporter-controller-manager.yaml | 2 +- .../10_appcat_maintenance_recording_rule.yaml | 4 +-- .../appcat/21_composition_vshn_minio.yaml | 2 +- .../appcat/21_composition_vshn_postgres.yaml | 2 +- .../21_composition_vshn_postgresrestore.yaml | 2 +- .../appcat/21_composition_vshn_redis.yaml | 2 +- .../appcat/appcat/22_prom_rule_sla_redis.yaml | 16 ++++++++++ .../controllers/appcat/30_deployment.yaml | 2 +- .../appcat/sla_reporter/01_cronjob.yaml | 2 +- .../sli_exporter/90_slo_vshn_postgresql.yaml | 32 +++++++++---------- .../sli_exporter/90_slo_vshn_redis.yaml | 28 ++++++++-------- ...appcat-sliexporter-controller-manager.yaml | 2 +- 29 files changed, 87 insertions(+), 67 deletions(-) create mode 100644 tests/golden/vshn/appcat/appcat/22_prom_rule_sla_redis.yaml diff --git a/class/defaults.yml b/class/defaults.yml index 611c26805..52bf0814b 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -39,7 +39,7 @@ parameters: appcat: registry: ghcr.io repository: vshn/appcat - tag: v4.33.0 + tag: v4.34.0 apiserver: registry: ghcr.io repository: vshn/appcat-apiserver @@ -381,6 +381,7 @@ parameters: bucket_region: "lpg" grpcEndpoint: ${appcat:grpcEndpoint} defaultPlan: standard-1 + sla: 99.25 plans: standard-512m: size: diff --git a/component/main.jsonnet b/component/main.jsonnet index 3d39ddee1..6979e1d3d 100644 --- a/component/main.jsonnet +++ b/component/main.jsonnet @@ -122,7 +122,7 @@ local maintenanceRule = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule name: 'appcat-cluster-maintenance', rules: [ { - expr: 'scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) or vector(0))', + expr: 'max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) or vector(0)', record: 'appcat:cluster:maintenance', }, ], diff --git a/component/slos.libsonnet b/component/slos.libsonnet index 4d4b1f8a9..76bd779a2 100644 --- a/component/slos.libsonnet +++ b/component/slos.libsonnet @@ -66,12 +66,12 @@ local prometheusRule(name) = sli: { events: { // The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error - error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0', - total_query: '(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0', + error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0', + total_query: '(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0', }, }, alerting+: { - name: 'SLO_AppCat_VSHNPosgtreSQLUptime', + name: 'SLO_AppCat_VSHNPostgreSQLUptime', annotations+: { summary: 'Probes to PostgreSQL by VSHN instance fail', }, @@ -110,8 +110,8 @@ local prometheusRule(name) = sli: { events: { // The 0*rate(...) makes sure that the query reports an error rate for all instances, even if that instance has never produced a single error - error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0', - total_query: '(sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0', + error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0', + total_query: '(sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0', }, }, alerting+: { diff --git a/component/vshn_redis.jsonnet b/component/vshn_redis.jsonnet index 732724c28..96dd09f66 100644 --- a/component/vshn_redis.jsonnet +++ b/component/vshn_redis.jsonnet @@ -45,6 +45,8 @@ local xrd = xrds.XRDFromCRD( connectionSecretKeys=connectionSecretKeys, ) + xrds.WithPlanDefaults(redisPlans, redisParams.defaultPlan); +local promRuleRedisSLA = common.PromRuleSLA(params.services.vshn.redis.sla, 'VSHNRedis'); + local restoreServiceAccount = kube.ServiceAccount('redisrestoreserviceaccount') + { metadata+: { namespace: params.services.controlNamespace, @@ -691,5 +693,6 @@ if params.services.vshn.enabled && redisParams.enabled then { '20_rbac_vshn_redis_resize': [ resizeClusterRole, resizeServiceAccount, resizeClusterRoleBinding ], '20_plans_vshn_redis': plansCM, '21_composition_vshn_redis': composition, + '22_prom_rule_sla_redis': promRuleRedisSLA, [if isOpenshift then '21_openshift_template_redis_vshn']: osTemplate, } else {} diff --git a/tests/golden/apiserver/appcat/appcat/10_appcat_maintenance_recording_rule.yaml b/tests/golden/apiserver/appcat/appcat/10_appcat_maintenance_recording_rule.yaml index d4051f79a..534448600 100644 --- a/tests/golden/apiserver/appcat/appcat/10_appcat_maintenance_recording_rule.yaml +++ b/tests/golden/apiserver/appcat/appcat/10_appcat_maintenance_recording_rule.yaml @@ -10,6 +10,6 @@ spec: groups: - name: appcat-cluster-maintenance rules: - - expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) - or vector(0)) + - expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) + or vector(0) record: appcat:cluster:maintenance diff --git a/tests/golden/cloudscale/appcat/appcat/10_appcat_maintenance_recording_rule.yaml b/tests/golden/cloudscale/appcat/appcat/10_appcat_maintenance_recording_rule.yaml index d4051f79a..534448600 100644 --- a/tests/golden/cloudscale/appcat/appcat/10_appcat_maintenance_recording_rule.yaml +++ b/tests/golden/cloudscale/appcat/appcat/10_appcat_maintenance_recording_rule.yaml @@ -10,6 +10,6 @@ spec: groups: - name: appcat-cluster-maintenance rules: - - expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) - or vector(0)) + - expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) + or vector(0) record: appcat:cluster:maintenance diff --git a/tests/golden/controllers/appcat/appcat/10_appcat_maintenance_recording_rule.yaml b/tests/golden/controllers/appcat/appcat/10_appcat_maintenance_recording_rule.yaml index d4051f79a..534448600 100644 --- a/tests/golden/controllers/appcat/appcat/10_appcat_maintenance_recording_rule.yaml +++ b/tests/golden/controllers/appcat/appcat/10_appcat_maintenance_recording_rule.yaml @@ -10,6 +10,6 @@ spec: groups: - name: appcat-cluster-maintenance rules: - - expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) - or vector(0)) + - expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) + or vector(0) record: appcat:cluster:maintenance diff --git a/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml b/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml index 2f4709a28..b2e495054 100644 --- a/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml +++ b/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml @@ -23,7 +23,7 @@ spec: env: - name: PLANS_NAMESPACE value: syn-appcat - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 livenessProbe: httpGet: path: /healthz diff --git a/tests/golden/defaults/appcat/appcat/10_appcat_maintenance_recording_rule.yaml b/tests/golden/defaults/appcat/appcat/10_appcat_maintenance_recording_rule.yaml index d4051f79a..534448600 100644 --- a/tests/golden/defaults/appcat/appcat/10_appcat_maintenance_recording_rule.yaml +++ b/tests/golden/defaults/appcat/appcat/10_appcat_maintenance_recording_rule.yaml @@ -10,6 +10,6 @@ spec: groups: - name: appcat-cluster-maintenance rules: - - expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) - or vector(0)) + - expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) + or vector(0) record: appcat:cluster:maintenance diff --git a/tests/golden/defaults/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/tests/golden/defaults/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml index beee4840e..0a67b8bd2 100644 --- a/tests/golden/defaults/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml +++ b/tests/golden/defaults/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml @@ -28,7 +28,7 @@ spec: value: "false" - name: APPCAT_SLI_VSHNREDIS value: "false" - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 livenessProbe: httpGet: path: /healthz diff --git a/tests/golden/exoscale/appcat/appcat/10_appcat_maintenance_recording_rule.yaml b/tests/golden/exoscale/appcat/appcat/10_appcat_maintenance_recording_rule.yaml index d4051f79a..534448600 100644 --- a/tests/golden/exoscale/appcat/appcat/10_appcat_maintenance_recording_rule.yaml +++ b/tests/golden/exoscale/appcat/appcat/10_appcat_maintenance_recording_rule.yaml @@ -10,6 +10,6 @@ spec: groups: - name: appcat-cluster-maintenance rules: - - expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) - or vector(0)) + - expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) + or vector(0) record: appcat:cluster:maintenance diff --git a/tests/golden/minio/appcat/appcat/10_appcat_maintenance_recording_rule.yaml b/tests/golden/minio/appcat/appcat/10_appcat_maintenance_recording_rule.yaml index d4051f79a..534448600 100644 --- a/tests/golden/minio/appcat/appcat/10_appcat_maintenance_recording_rule.yaml +++ b/tests/golden/minio/appcat/appcat/10_appcat_maintenance_recording_rule.yaml @@ -10,6 +10,6 @@ spec: groups: - name: appcat-cluster-maintenance rules: - - expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) - or vector(0)) + - expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) + or vector(0) record: appcat:cluster:maintenance diff --git a/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml b/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml index ad1443103..90075b948 100644 --- a/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml +++ b/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml @@ -25,7 +25,7 @@ spec: data: controlNamespace: syn-appcat-control defaultPlan: standard-1 - imageTag: v4.33.0 + imageTag: v4.34.0 maintenanceSA: helm-based-service-maintenance minioChartRepository: https://charts.min.io minioChartVersion: 5.0.13 diff --git a/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml b/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml index 2f4709a28..b2e495054 100644 --- a/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml +++ b/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml @@ -23,7 +23,7 @@ spec: env: - name: PLANS_NAMESPACE value: syn-appcat - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 livenessProbe: httpGet: path: /healthz diff --git a/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml b/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml index 8a3e895f2..6358d8b55 100644 --- a/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml +++ b/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml @@ -30,7 +30,7 @@ spec: envFrom: - secretRef: name: appcat-sla-reports-creds - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 name: sla-reporter resources: limits: diff --git a/tests/golden/minio/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/tests/golden/minio/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml index beee4840e..0a67b8bd2 100644 --- a/tests/golden/minio/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml +++ b/tests/golden/minio/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml @@ -28,7 +28,7 @@ spec: value: "false" - name: APPCAT_SLI_VSHNREDIS value: "false" - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 livenessProbe: httpGet: path: /healthz diff --git a/tests/golden/openshift/appcat/appcat/10_appcat_maintenance_recording_rule.yaml b/tests/golden/openshift/appcat/appcat/10_appcat_maintenance_recording_rule.yaml index d4051f79a..534448600 100644 --- a/tests/golden/openshift/appcat/appcat/10_appcat_maintenance_recording_rule.yaml +++ b/tests/golden/openshift/appcat/appcat/10_appcat_maintenance_recording_rule.yaml @@ -10,6 +10,6 @@ spec: groups: - name: appcat-cluster-maintenance rules: - - expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) - or vector(0)) + - expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) + or vector(0) record: appcat:cluster:maintenance diff --git a/tests/golden/openshift/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/tests/golden/openshift/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml index beee4840e..0a67b8bd2 100644 --- a/tests/golden/openshift/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml +++ b/tests/golden/openshift/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml @@ -28,7 +28,7 @@ spec: value: "false" - name: APPCAT_SLI_VSHNREDIS value: "false" - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 livenessProbe: httpGet: path: /healthz diff --git a/tests/golden/vshn/appcat/appcat/10_appcat_maintenance_recording_rule.yaml b/tests/golden/vshn/appcat/appcat/10_appcat_maintenance_recording_rule.yaml index d4051f79a..534448600 100644 --- a/tests/golden/vshn/appcat/appcat/10_appcat_maintenance_recording_rule.yaml +++ b/tests/golden/vshn/appcat/appcat/10_appcat_maintenance_recording_rule.yaml @@ -10,6 +10,6 @@ spec: groups: - name: appcat-cluster-maintenance rules: - - expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) - or vector(0)) + - expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) + or vector(0) record: appcat:cluster:maintenance diff --git a/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml b/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml index 8a9c2234f..823c9051f 100644 --- a/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml +++ b/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml @@ -25,7 +25,7 @@ spec: data: controlNamespace: syn-appcat-control defaultPlan: standard-1 - imageTag: v4.33.0 + imageTag: v4.34.0 maintenanceSA: helm-based-service-maintenance minioChartRepository: https://charts.min.io minioChartVersion: 5.0.13 diff --git a/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index 0f2a9539d..b66fa9a40 100644 --- a/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -33,7 +33,7 @@ spec: emailAlertingSmtpHost: smtp.eu.mailgun.org:465 emailAlertingSmtpUsername: myuser@example.com externalDatabaseConnectionsEnabled: 'true' - imageTag: v4.33.0 + imageTag: v4.34.0 quotasEnabled: 'false' sgNamespace: stackgres sideCars: '{"clusterController": {"limits": {"cpu": "600m", "memory": "768Mi"}, diff --git a/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 32b8c4284..379a4aeee 100644 --- a/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -33,7 +33,7 @@ spec: emailAlertingSmtpHost: smtp.eu.mailgun.org:465 emailAlertingSmtpUsername: myuser@example.com externalDatabaseConnectionsEnabled: 'true' - imageTag: v4.33.0 + imageTag: v4.34.0 quotasEnabled: 'false' sgNamespace: stackgres sideCars: '{"clusterController": {"limits": {"cpu": "600m", "memory": "768Mi"}, diff --git a/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index d067811e4..1a9c3f1fd 100644 --- a/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -30,7 +30,7 @@ spec: data: bucketRegion: lpg controlNamespace: syn-appcat-control - imageTag: v4.33.0 + imageTag: v4.34.0 maintenanceSA: helm-based-service-maintenance quotasEnabled: 'false' restoreSA: redisrestoreserviceaccount diff --git a/tests/golden/vshn/appcat/appcat/22_prom_rule_sla_redis.yaml b/tests/golden/vshn/appcat/appcat/22_prom_rule_sla_redis.yaml new file mode 100644 index 000000000..8ddcd1c0a --- /dev/null +++ b/tests/golden/vshn/appcat/appcat/22_prom_rule_sla_redis.yaml @@ -0,0 +1,16 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: vshn-vshnredis-sla + name: vshn-vshnredis-sla + namespace: appcat-slos +spec: + groups: + - name: appcat-vshnredis-sla-target + rules: + - expr: vector(99.25) + labels: + service: VSHNRedis + record: sla:objective:ratio diff --git a/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml b/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml index 2f4709a28..b2e495054 100644 --- a/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml +++ b/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml @@ -23,7 +23,7 @@ spec: env: - name: PLANS_NAMESPACE value: syn-appcat - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 livenessProbe: httpGet: path: /healthz diff --git a/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml b/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml index 722e3c3ee..e47b4a6b6 100644 --- a/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml +++ b/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml @@ -30,7 +30,7 @@ spec: envFrom: - secretRef: name: appcat-sla-reports-creds - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 name: sla-reporter resources: limits: diff --git a/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_postgresql.yaml b/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_postgresql.yaml index bcf813298..c910ff186 100644 --- a/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_postgresql.yaml +++ b/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_postgresql.yaml @@ -11,9 +11,9 @@ spec: - name: sloth-slo-sli-recordings-appcat-vshn-postgresql-uptime rules: - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-postgresql-uptime sloth_service: appcat-vshn-postgresql @@ -21,9 +21,9 @@ spec: sloth_window: 5m record: slo:sli_error:ratio_rate5m - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-postgresql-uptime sloth_service: appcat-vshn-postgresql @@ -31,9 +31,9 @@ spec: sloth_window: 30m record: slo:sli_error:ratio_rate30m - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-postgresql-uptime sloth_service: appcat-vshn-postgresql @@ -41,9 +41,9 @@ spec: sloth_window: 1h record: slo:sli_error:ratio_rate1h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-postgresql-uptime sloth_service: appcat-vshn-postgresql @@ -51,9 +51,9 @@ spec: sloth_window: 2h record: slo:sli_error:ratio_rate2h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-postgresql-uptime sloth_service: appcat-vshn-postgresql @@ -61,9 +61,9 @@ spec: sloth_window: 6h record: slo:sli_error:ratio_rate6h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-postgresql-uptime sloth_service: appcat-vshn-postgresql @@ -71,9 +71,9 @@ spec: sloth_window: 1d record: slo:sli_error:ratio_rate1d - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-postgresql-uptime sloth_service: appcat-vshn-postgresql @@ -147,7 +147,7 @@ spec: record: sloth_slo_info - name: sloth-slo-alerts-appcat-vshn-postgresql-uptime rules: - - alert: SLO_AppCat_VSHNPosgtreSQLUptime + - alert: SLO_AppCat_VSHNPostgreSQLUptime annotations: for: 6m summary: Probes to PostgreSQL by VSHN instance fail @@ -176,7 +176,7 @@ spec: syn: 'true' syn_component: appcat syn_team: schedar - - alert: SLO_AppCat_VSHNPosgtreSQLUptime + - alert: SLO_AppCat_VSHNPostgreSQLUptime annotations: runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#uptime summary: Probes to PostgreSQL by VSHN instance fail diff --git a/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis.yaml b/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis.yaml index 782f812c0..17926ea5c 100644 --- a/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis.yaml +++ b/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_redis.yaml @@ -11,9 +11,9 @@ spec: - name: sloth-slo-sli-recordings-appcat-vshn-redis-uptime rules: - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-redis-uptime sloth_service: appcat-vshn-redis @@ -21,9 +21,9 @@ spec: sloth_window: 5m record: slo:sli_error:ratio_rate5m - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-redis-uptime sloth_service: appcat-vshn-redis @@ -31,9 +31,9 @@ spec: sloth_window: 30m record: slo:sli_error:ratio_rate30m - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-redis-uptime sloth_service: appcat-vshn-redis @@ -41,9 +41,9 @@ spec: sloth_window: 1h record: slo:sli_error:ratio_rate1h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-redis-uptime sloth_service: appcat-vshn-redis @@ -51,9 +51,9 @@ spec: sloth_window: 2h record: slo:sli_error:ratio_rate2h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-redis-uptime sloth_service: appcat-vshn-redis @@ -61,9 +61,9 @@ spec: sloth_window: 6h record: slo:sli_error:ratio_rate6h - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-redis-uptime sloth_service: appcat-vshn-redis @@ -71,9 +71,9 @@ spec: sloth_window: 1d record: slo:sli_error:ratio_rate1d - expr: | - ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) / - ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0) + ((sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0) labels: sloth_id: appcat-vshn-redis-uptime sloth_service: appcat-vshn-redis diff --git a/tests/golden/vshn/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/tests/golden/vshn/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml index f693afe2a..d346447b3 100644 --- a/tests/golden/vshn/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml +++ b/tests/golden/vshn/appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml @@ -28,7 +28,7 @@ spec: value: "true" - name: APPCAT_SLI_VSHNREDIS value: "true" - image: ghcr.io/vshn/appcat:v4.33.0 + image: ghcr.io/vshn/appcat:v4.34.0 livenessProbe: httpGet: path: /healthz