diff --git a/component/component/common.libsonnet b/component/component/common.libsonnet index 2e6be9510..63bc0611a 100644 --- a/component/component/common.libsonnet +++ b/component/component/common.libsonnet @@ -208,10 +208,10 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional spec: { groups: [ { - name: '%s-general-alerts' % serviceNameLower, + name: '%s-storage' % serviceNameLower, rules: [ { - name: '%s-storage' % serviceNameLower, + alert: serviceName + 'PersistentVolumeFillingUp', annotations: { description: 'The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.', @@ -238,9 +238,13 @@ local generatePrometheusNonSLORules(serviceName, memoryContainerName, additional severity: 'warning', }, }, + ], + }, + { + name: std.asciiLower(serviceName) + '-memory', + rules: [ { alert: serviceName + 'MemoryCritical', - name: std.asciiLower(serviceName) + '-memory', annotations: { description: 'The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reducde the load of this instance, or increase the memory.', // runbook_url: 'TBD', diff --git a/component/component/vshn_postgres.jsonnet b/component/component/vshn_postgres.jsonnet index e7b37d56d..bd0269dc8 100644 --- a/component/component/vshn_postgres.jsonnet +++ b/component/component/vshn_postgres.jsonnet @@ -774,7 +774,7 @@ local prometheusRule = common.GeneratePrometheusNonSLORules( alert: 'PostgreSQLConnectionsCritical', annotations: { description: 'The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.', - // runbook_url: 'TBD', + runbook_url: 
'https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical', summary: 'Connection usage critical', }, @@ -787,7 +787,6 @@ local prometheusRule = common.GeneratePrometheusNonSLORules( }, ], }, - // new { name: 'postgresql-replication', rules: [ diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml index cec83ca65..26f28b658 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml @@ -960,7 +960,7 @@ spec: name: postgresql-rules spec: groups: - - name: postgresql-general-alerts + - name: postgresql-storage rules: - alert: PostgreSQLPersistentVolumeFillingUp annotations: @@ -981,7 +981,6 @@ spec: labels: severity: critical syn_team: schedar - name: postgresql-storage - alert: PostgreSQLPersistentVolumeFillingUp annotations: description: Based on recent sampling, the volume claimed @@ -1003,6 +1002,8 @@ spec: for: 1h labels: severity: warning + - name: postgresql-memory + rules: - alert: PostgreSQLMemoryCritical annotations: description: |- @@ -1017,7 +1018,6 @@ spec: labels: severity: critical syn_team: schedar - name: postgresql-memory - name: postgresql-connections rules: - alert: PostgreSQLConnectionsCritical @@ -1025,6 +1025,7 @@ spec: description: |- The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. Please reduce the load of this instance. 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical summary: Connection usage critical expr: label_replace( topk(1, sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml index 04e97c2e4..760fde789 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml @@ -1062,7 +1062,7 @@ spec: name: postgresql-rules spec: groups: - - name: postgresql-general-alerts + - name: postgresql-storage rules: - alert: PostgreSQLPersistentVolumeFillingUp annotations: @@ -1083,7 +1083,6 @@ spec: labels: severity: critical syn_team: schedar - name: postgresql-storage - alert: PostgreSQLPersistentVolumeFillingUp annotations: description: Based on recent sampling, the volume claimed @@ -1105,6 +1104,8 @@ spec: for: 1h labels: severity: warning + - name: postgresql-memory + rules: - alert: PostgreSQLMemoryCritical annotations: description: |- @@ -1119,7 +1120,6 @@ spec: labels: severity: critical syn_team: schedar - name: postgresql-memory - name: postgresql-connections rules: - alert: PostgreSQLConnectionsCritical @@ -1127,6 +1127,7 @@ spec: description: |- The number of connections to the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} have been over 90% of the configured connections for 2 hours. Please reduce the load of this instance. 
+ runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#PostgreSQLConnectionsCritical summary: Connection usage critical expr: label_replace( topk(1, sum(pg_stat_activity_count) by (pod, namespace) > 90/100 * sum(pg_settings_max_connections) diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml index e32e8dc3c..61d0d970b 100644 --- a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml +++ b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml @@ -97,7 +97,7 @@ spec: name: redis-rules spec: groups: - - name: redis-general-alerts + - name: redis-storage rules: - alert: redisPersistentVolumeFillingUp annotations: @@ -118,7 +118,6 @@ spec: labels: severity: critical syn_team: schedar - name: redis-storage - alert: redisPersistentVolumeFillingUp annotations: description: Based on recent sampling, the volume claimed @@ -140,6 +139,8 @@ spec: for: 1h labels: severity: warning + - name: redis-memory + rules: - alert: redisMemoryCritical annotations: description: |- @@ -154,7 +155,6 @@ spec: labels: severity: critical syn_team: schedar - name: redis-memory providerConfigRef: name: kubernetes name: prometheusrule diff --git a/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc b/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc index b9d9ed15a..147cc6812 100644 --- a/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc +++ b/docs/modules/ROOT/pages/runbooks/vshn-postgresql.adoc @@ -167,8 +167,15 @@ This alert fires when there are issues with statefullset responsible for replica ``` kubectl describe -n vshn-postgresql- sts -## for exmaple: kubectl -n vshn-postgresql-test-cluster-always-true-jnlj4 describe sts test-cluster-always-true-jnlj4 +## for example: kubectl -n vshn-postgresql-test-cluster-always-true-jnlj4 describe sts test-cluster-always-true-jnlj4 ## get events from affected namespace and look for 
issues k -n vshn-postgresql-test-cluster-always-true-jnlj4 get events ``` + +[[PostgreSQLConnectionsCritical]] +== PostgreSQLConnectionsCritical + +This alert fires when the number of used connections is over 90% of the configured `max_connections` limit (defaults to 100). +It means that either the connection limit is set too low or an application is misbehaving and spawning too many connections. +You either need to raise the `max_connections` parameter on the PostgreSQL instance or debug the application, as it might be misbehaving and spawning too many connections.