statisticsnorway · ssb-jnk · Jan 10, 2025 · Jan 10, 2025
diff --git a/.nais/alerts.yaml b/.nais/alerts.yaml
@@ -0,0 +1,75 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: alert-pseudo-service
+  namespace: dapla-stat
+  labels:
+    team: dapla-stat  # NOTE(review): presumably used by the platform to route alerts to the team - confirm
+spec:
+  groups:
+    - name: dapla-stat
+      rules:
+        # No available pseudo-service replicas. Namespace-scoped so a same-named deployment elsewhere cannot trigger it.
+        - alert: PseudoServiceUnavailable
+          expr: kube_deployment_status_replicas_available{namespace="dapla-stat", deployment="pseudo-service"} == 0
+          for: 1m  # the condition must hold for a full minute before the alert fires
+          annotations:
+            title: "Pseudo-service is unavailable"
+            consequence: "The service is unavailable to users. Immediate investigation required."
+            action: "Check the deployment status and logs for issues."
+          labels:
+            service: pseudo-service
+            namespace: dapla-stat
+            severity: critical
+
+        # CPU alert: the 5-minute rate of process_cpu_seconds_total (i.e. CPU cores in use) stays above 0.8 for 5 minutes.
+        - alert: HighCPUUsage
+          expr: rate(process_cpu_seconds_total{app="pseudo-service"}[5m]) > 0.8
+          for: 5m
+          labels:
+            severity: warning
+            namespace: dapla-stat
+            service: pseudo-service
+          annotations:
+            title: 'High CPU usage detected'
+            consequence: 'The service might experience performance degradation.'
+            action: 'Investigate the cause of high CPU usage and optimize if necessary.'
+
+        # Memory alert: resident memory above 90% of 12 GiB for 5 minutes. 12 GiB is hard-coded (presumably the memory limit - keep in sync).
+        - alert: HighMemoryUsage
+          expr: process_resident_memory_bytes{app="pseudo-service"} > (0.9 * 12 * 1024 * 1024 * 1024)
+          for: 5m
+          labels:
+            severity: warning
+            namespace: dapla-stat
+            service: pseudo-service
+          annotations:
+            title: 'High memory usage detected'
+            consequence: 'The service might experience instability due to high memory usage.'
+            action: 'Check memory utilization and consider increasing resources or optimizing the service.'
+
+        # Error-ratio alert: log_messages_errors (level Error) exceeds 10% of log_messages_total, by 3-minute rate, for 3 minutes.
+        - alert: HighNumberOfErrors
+          expr: (100 * sum by (app, namespace) (rate(log_messages_errors{app="pseudo-service", level="Error"}[3m])) / sum by (app, namespace) (rate(log_messages_total{app="pseudo-service"}[3m]))) > 10
+          for: 3m  # with no log traffic the ratio is NaN, which never satisfies "> 10", so the alert stays silent
+          annotations:
+            title: "High number of errors logged in pseudo-service"
+            consequence: "The application is logging a significant number of errors."
+            action: "Check the service logs for errors and address the root cause."
+          labels:
+            service: pseudo-service
+            namespace: dapla-stat
+            severity: critical
+
+        # Fires when a pseudo-service container in dapla-stat has restarted more than 3 times within the last 15 minutes.
+        - alert: HighPodRestarts  # selects on the kube-state-metrics "container" label; assumes the container is named pseudo-service
+          expr: increase(kube_pod_container_status_restarts_total{namespace="dapla-stat", container="pseudo-service"}[15m]) > 3
+          for: 1m  # the 15m increase() window already covers the period; a long "for" would delay or suppress the alert
+          annotations:
+            title: "High number of pod restarts"
+            consequence: "The service may be unstable or misconfigured."
+            action: "Investigate the cause of pod restarts and fix configuration or resource issues."
+          labels:
+            service: pseudo-service
+            namespace: dapla-stat
+            severity: warning