ROX-17469: Implemented SLIs/alerts for Central API latencies #117

Open: wants to merge 1 commit into master
170 changes: 170 additions & 0 deletions resources/prometheus/prometheus-rules.yaml
@@ -454,6 +454,78 @@ spec:
)
record: central:sli:availability:extended_avg_over_time28d

# - Queries the 90th percentile of central's handled GRPC/HTTP API request latencies over the last 10 minutes
#   and records, as a 0/1 value, whether it meets the 100ms (0.1s) target.
- expr: |
(histogram_quantile(0.9, sum by(le, namespace, grpc_service, grpc_method) (rate(grpc_server_handling_seconds_bucket{container="central", grpc_method!~"ScanImageInternal|DeleteImages|EnrichLocalImageInternal|RunReport|ScanImage|TriggerExternalBackup|Ping"}[10m]))) > 0) < bool 0.1
Comment (Contributor): TIL, I didn't know about < bool. We can probably use that elsewhere as well to simplify the PromQL.
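For reference, a minimal illustration of the bool modifier (the up metric and job selector are illustrative, not part of this change): a plain comparison filters the vector and keeps the original sample values, while bool maps every series to 1 or 0.

up{job="central"} > 0.5        # filters: keeps matching series with their original values
up{job="central"} > bool 0.5   # returns 1 or 0 for every series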

Comment (Contributor): The incoming gRPC calls are already very sparse for most Centrals. I think we should consider consolidating them if they roughly do the same thing latency-wise, so I would sum here over grpc_service / grpc_method.
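A hedged sketch of that consolidation, keeping the rule's existing filters but aggregating away grpc_service / grpc_method so all sparse methods share one series per namespace:

(histogram_quantile(0.9, sum by(le, namespace) (rate(grpc_server_handling_seconds_bucket{container="central", grpc_method!~"ScanImageInternal|DeleteImages|EnrichLocalImageInternal|RunReport|ScanImage|TriggerExternalBackup|Ping"}[10m]))) > 0) < bool 0.1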

record: central:grpc_server_handling_seconds:rate10m:p90
- expr: |
(histogram_quantile(0.9, sum by(le, namespace, path) (rate(http_incoming_request_duration_histogram_seconds_bucket{container="central", code!~"5.*|4.*", path!~"/api/extensions/scannerdefinitions|/api/graphql|/sso/|/|/api/cli/download/"}[10m]))) > 0) < bool 0.1
record: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90

# - Queries the current central API latency (GRPC and HTTP) SLI by calculating the ratio of successful
#   instances of central:xxx:rate10m:p90 to its total instances over a given period.
# - To get the current SLI for a variable PERIOD, run the following query, where PERIOD is the desired period in
#   PromQL duration format. This query is useful for determining an SLI dynamically, independent of any SLO.
#
# sum_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[PERIOD]) / count_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[PERIOD])
#
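# - For example, with PERIOD=7d (an illustrative value, not part of this change):
#
#   sum_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[7d]) / count_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[7d])
#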
- expr: |
sum_over_time(central:grpc_server_handling_seconds:rate10m:p90[28d]) / count_over_time(central:grpc_server_handling_seconds:rate10m:p90[28d])
record: central:grpc_server_handling_seconds:rate10m:p90:sli
- expr: |
sum_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[28d]) / count_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[28d])
record: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sli

# - Queries the error rate, i.e. the ratio of instances of central:xxx:rate10m:p90 that
#   were equal to 0 to the total instances of central:xxx:rate10m:p90 within a period.
- expr: |
1 - central:grpc_server_handling_seconds:rate10m:p90:sli
Comment (Contributor): Is there a reason you changed the order in the naming compared to existing metrics? E.g. central:grpc_server_handling_seconds:rate10m:p90:sli vs central:sli:grpc_server_handling_seconds:rate10m:p90.

record: central:grpc_server_handling_seconds:rate10m:p90:error_rate28d
- expr: |
1 - central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sli
record: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:error_rate28d

# - Queries the error rate for a 1h window.
- expr: |
1 - (sum_over_time(central:grpc_server_handling_seconds:rate10m:p90[1h]) / count_over_time(central:grpc_server_handling_seconds:rate10m:p90[1h]))
record: central:grpc_server_handling_seconds:rate10m:p90:error_rate1h
- expr: |
1 - (sum_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[1h]) / count_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[1h]))
record: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:error_rate1h

# - Queries the error budget exhaustion (or consumption) for the whole SLO window (28d).
- expr: |
(1 - central:grpc_server_handling_seconds:rate10m:p90:sli) / 0.01
Comment (Contributor): Can you define a scalar recording rule for the target (0.01 / 0.99)? That allows us to change the value in a single place.
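A minimal sketch of that suggestion (the central:slo:latency_error_budget name is hypothetical, not part of this change):

- expr: "0.01"
  record: central:slo:latency_error_budget
# The exhaustion rules could then reference it via scalar(), keeping the target in one place:
#   (1 - central:grpc_server_handling_seconds:rate10m:p90:sli) / scalar(central:slo:latency_error_budget)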

record: central:grpc_server_handling_seconds:rate10m:p90:error_budget_exhaustion28d
- expr: |
(1 - central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sli) / 0.01
record: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:error_budget_exhaustion28d

# - The error budget burn rate (a.k.a. burn rate) is the ratio of central:xxx:rate10m:p90:error_rateyyy
#   to the error budget for a period (e.g. 1h, 1d, etc.).
- expr: |
central:grpc_server_handling_seconds:rate10m:p90:error_rate1h / 0.01
record: central:grpc_server_handling_seconds:rate10m:p90:burn_rate1h
- expr: |
central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:error_rate1h / 0.01
record: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:burn_rate1h
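To make the threshold concrete, a worked example under the 1% budget (99% SLO) used above:

# - Example: burn_rate1h = error_rate1h / 0.01. A sustained burn rate of 1
#   exhausts the 28d budget in exactly 28 days, a burn rate of 28 exhausts it in
#   one day, and the 0.5 alert threshold used further down corresponds to a 0.5%
#   hourly error rate.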

# - A sample count filter that ignores central:xxx:rate10m:p90 instances that have fewer samples than the expected sample count.
Comment (Contributor): How does the filter treat periods where there is no incoming traffic (and the base metrics are therefore undefined)?

# - The expected count of 10m samples of central:xxx:rate10m:p90 over 28 days (i.e. 28d/10m) is 4032.
# - The expected count of 10m samples of central:xxx:rate10m:p90 over an hour (i.e. 1h/10m) is 6.
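# - (Derivation: 28d contains 28 * 24 * 6 = 4032 ten-minute intervals; 1h contains 6.)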
- expr: |
(count_over_time(central:grpc_server_handling_seconds:rate10m:p90[28d]) >= 4032) / count_over_time(central:grpc_server_handling_seconds:rate10m:p90[28d])
record: central:grpc_server_handling_seconds:rate10m:p90:sample_count_filter28d
- expr: |
(count_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[28d]) >= 4032) / count_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[28d])
record: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sample_count_filter28d
- expr: |
(count_over_time(central:grpc_server_handling_seconds:rate10m:p90[1h]) >= 6) / count_over_time(central:grpc_server_handling_seconds:rate10m:p90[1h])
record: central:grpc_server_handling_seconds:rate10m:p90:sample_count_filter1h
- expr: |
(count_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[1h]) >= 6) / count_over_time(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90[1h])
record: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sample_count_filter1h

- name: rhacs-central.slo
rules:
# Availability SLO
@@ -533,6 +605,11 @@ spec:
severity: critical
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
rhacs_org_name: "{{ $labels.rhacs_org_name }}"
Comment (Contributor): We can't add these labels here in general, because the values originate in Central, and if Central itself is down, they don't exist.

rhacs_org_id: "{{ $labels.rhacs_org_id }}"
rhacs_cluster_name: "{{ $labels.rhacs_cluster_name }}"
rhacs_environment: "{{ $labels.rhacs_environment }}"

- name: az-resources
rules:
- record: strictly_worker_nodes
@@ -639,3 +716,96 @@ spec:
summary: "There is a high risk of over-committing CPU resources on worker nodes in AZ {{ $labels.availability_zone }}."
description: "During the last 5 minutes, the average CPU limit commitment on worker nodes in AZ {{ $labels.availability_zone }} was {{ $value | humanizePercentage }}. This is above the recommended threshold of 200%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"

- alert: Central latency error budget exhaustion for GRPC API - 90%
annotations:
message: "Latency error budget exhaustion for central's GRPC API. Current exhaustion: {{ $value | humanizePercentage }}."
expr: |
(central:grpc_server_handling_seconds:rate10m:p90:sample_count_filter28d * central:grpc_server_handling_seconds:rate10m:p90:error_budget_exhaustion28d) >= 0.9
labels:
service: central
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.namespace }}"
Comment (Contributor): The instance id is not the same as the namespace. The namespace is rhacs-{instance_id}.
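A hedged sketch of deriving the id at query time with PromQL's label_replace (illustrative only, not part of this change):

label_replace(central:grpc_server_handling_seconds:rate10m:p90, "rhacs_instance_id", "$1", "namespace", "rhacs-(.*)")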

grpc_service: "{{ $labels.grpc_service }}"
grpc_method: "{{ $labels.grpc_method }}"
severity: critical
- alert: Central latency error budget exhaustion for GRPC API - 70%
annotations:
message: "Latency error budget exhaustion for central's GRPC API. Current exhaustion: {{ $value | humanizePercentage }}."
expr: |
(central:grpc_server_handling_seconds:rate10m:p90:sample_count_filter28d * central:grpc_server_handling_seconds:rate10m:p90:error_budget_exhaustion28d) >= 0.7
labels:
service: central
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.namespace }}"
grpc_service: "{{ $labels.grpc_service }}"
grpc_method: "{{ $labels.grpc_method }}"
severity: warning
- alert: Central latency error budget exhaustion for GRPC API - 50%
annotations:
message: "Latency error budget exhaustion for central's GRPC API. Current exhaustion: {{ $value | humanizePercentage }}."
expr: |
(central:grpc_server_handling_seconds:rate10m:p90:sample_count_filter28d * central:grpc_server_handling_seconds:rate10m:p90:error_budget_exhaustion28d) >= 0.5
labels:
service: central
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.namespace }}"
grpc_service: "{{ $labels.grpc_service }}"
grpc_method: "{{ $labels.grpc_method }}"
severity: warning
- alert: Central latency error budget exhaustion for HTTP API - 90%
annotations:
message: "Latency error budget exhaustion for central's HTTP API. Current exhaustion: {{ $value | humanizePercentage }}."
expr: |
(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sample_count_filter28d * central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:error_budget_exhaustion28d) >= 0.9
labels:
service: central
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.namespace }}"
path: "{{ $labels.path }}"
severity: critical
- alert: Central latency error budget exhaustion for HTTP API - 70%
annotations:
message: "Latency error budget exhaustion for central's HTTP API. Current exhaustion: {{ $value | humanizePercentage }}."
expr: |
(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sample_count_filter28d * central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:error_budget_exhaustion28d) >= 0.7
labels:
service: central
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.namespace }}"
path: "{{ $labels.path }}"
severity: warning
- alert: Central latency error budget exhaustion for HTTP API - 50%
annotations:
message: "Latency error budget exhaustion for central's HTTP API. Current exhaustion: {{ $value | humanizePercentage }}."
expr: |
(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sample_count_filter28d * central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:error_budget_exhaustion28d) >= 0.5
labels:
service: central
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.namespace }}"
path: "{{ $labels.path }}"
severity: warning
- alert: Central latency burn rate for GRPC API
annotations:
message: "Latency burn rate for central's GRPC API. Current burn rate per hour: {{ $value | humanize }}."
expr: |
Comment (stehessel, Contributor, Aug 23, 2023): Why did you choose 0.5 as the burn rate threshold? That seems very low to me. Note that by definition, a burn rate of 1 means that the full error budget will be consumed after 28 days. For slow burns we already have the alerts based on the total consumption. I'd keep this one for high burn alerts.
(central:grpc_server_handling_seconds:rate10m:p90:sample_count_filter1h * central:grpc_server_handling_seconds:rate10m:p90:burn_rate1h) >= 0.5
labels:
service: central
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.namespace }}"
grpc_service: "{{ $labels.grpc_service }}"
grpc_method: "{{ $labels.grpc_method }}"
severity: warning
- alert: Central latency burn rate for HTTP API
annotations:
message: "Latency burn rate for central's HTTP API. Current burn rate per hour: {{ $value | humanize }}."
expr: |
(central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:sample_count_filter1h * central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90:burn_rate1h) >= 0.5
labels:
service: central
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.namespace }}"
path: "{{ $labels.path }}"
severity: warning
134 changes: 134 additions & 0 deletions resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
@@ -186,3 +186,137 @@ tests:
exp_annotations:
message: "High availability burn rate for central. Current burn rate per hour: 59.17."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Test central GRPC/HTTP API latency alerts and rules
- interval: 10m
input_series:
- series: central:grpc_server_handling_seconds:rate10m:p90{namespace="rhacs-abc", grpc_service="grpcsvc", grpc_method="grpcmeth"}
values: 1+0x4000
- series: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90{namespace="rhacs-abc", grpc_service="grpcsvc", grpc_method="grpcmeth"}
values: 1+0x4000
alert_rule_test:
# Ensure alert for a 28d window doesn't fire if there aren't enough SLI samples.
- eval_time: 28d
alertname: Central latency error budget exhaustion for GRPC API - 90%
exp_alerts: []
# Ensure alert for a 28d window doesn't fire if there aren't enough SLI samples.
- eval_time: 28d
alertname: Central latency error budget exhaustion for HTTP API - 90%
exp_alerts: []
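Why these stay silent, in promtool's series notation:

# 1+0x4000 expands to 4001 samples (value 1, one per 10m interval), just under
# the 4032 samples the 28d sample-count filter requires, so the exhaustion
# alerts above are suppressed.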
- interval: 10m
input_series:
- series: central:grpc_server_handling_seconds:rate10m:p90{namespace="rhacs-abc", grpc_service="grpcsvc", grpc_method="grpcmeth"}
values: "1+0x3994 0+0x36"
- series: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90{namespace="rhacs-abc", grpc_service="grpcsvc", grpc_method="grpcmeth"}
values: "1+0x3994 0+0x36"
alert_rule_test:
- eval_time: 28d
alertname: Central latency error budget exhaustion for GRPC API - 90%
exp_alerts:
- exp_labels:
alertname: Central latency error budget exhaustion for GRPC API - 90%
namespace: rhacs-abc
rhacs_instance_id: rhacs-abc
service: central
grpc_service: grpcsvc
grpc_method: grpcmeth
severity: critical
exp_annotations:
message: "Latency error budget exhaustion for central's GRPC API. Current exhaustion: 91.77%."
- eval_time: 28d
alertname: Central latency error budget exhaustion for GRPC API - 70%
exp_alerts:
- exp_labels:
alertname: Central latency error budget exhaustion for GRPC API - 70%
namespace: rhacs-abc
rhacs_instance_id: rhacs-abc
service: central
grpc_service: grpcsvc
grpc_method: grpcmeth
severity: warning
exp_annotations:
message: "Latency error budget exhaustion for central's GRPC API. Current exhaustion: 91.77%."
- eval_time: 28d
alertname: Central latency error budget exhaustion for GRPC API - 50%
exp_alerts:
- exp_labels:
alertname: Central latency error budget exhaustion for GRPC API - 50%
namespace: rhacs-abc
rhacs_instance_id: rhacs-abc
service: central
grpc_service: grpcsvc
grpc_method: grpcmeth
severity: warning
exp_annotations:
message: "Latency error budget exhaustion for central's GRPC API. Current exhaustion: 91.77%."
- eval_time: 28d
alertname: Central latency error budget exhaustion for HTTP API - 90%
exp_alerts:
- exp_labels:
alertname: Central latency error budget exhaustion for HTTP API - 90%
namespace: rhacs-abc
rhacs_instance_id: rhacs-abc
service: central
grpc_service: grpcsvc
grpc_method: grpcmeth
severity: critical
exp_annotations:
message: "Latency error budget exhaustion for central's HTTP API. Current exhaustion: 91.77%."
- eval_time: 28d
alertname: Central latency error budget exhaustion for HTTP API - 70%
exp_alerts:
- exp_labels:
alertname: Central latency error budget exhaustion for HTTP API - 70%
namespace: rhacs-abc
rhacs_instance_id: rhacs-abc
service: central
grpc_service: grpcsvc
grpc_method: grpcmeth
severity: warning
exp_annotations:
message: "Latency error budget exhaustion for central's HTTP API. Current exhaustion: 91.77%."
- eval_time: 28d
alertname: Central latency error budget exhaustion for HTTP API - 50%
exp_alerts:
- exp_labels:
alertname: Central latency error budget exhaustion for HTTP API - 50%
namespace: rhacs-abc
rhacs_instance_id: rhacs-abc
service: central
grpc_service: grpcsvc
grpc_method: grpcmeth
severity: warning
exp_annotations:
message: "Latency error budget exhaustion for central's HTTP API. Current exhaustion: 91.77%."
- interval: 10m
input_series:
- series: central:grpc_server_handling_seconds:rate10m:p90{namespace="rhacs-abc", grpc_service="grpc_service", grpc_method="grpc_method"}
values: "1+0x2 0+0x2"
- series: central:http_incoming_request_duration_histogram_seconds_bucket:rate10m:p90{namespace="rhacs-abc", path="path"}
values: "1+0x2 0+0x2"
alert_rule_test:
- eval_time: 1h
alertname: Central latency burn rate for GRPC API
exp_alerts:
- exp_labels:
alertname: Central latency burn rate for GRPC API
namespace: rhacs-abc
rhacs_instance_id: rhacs-abc
service: central
grpc_service: grpc_service
grpc_method: grpc_method
severity: warning
exp_annotations:
message: "Latency burn rate for central's GRPC API. Current burn rate per hour: 50."
- eval_time: 1h
alertname: Central latency burn rate for HTTP API
exp_alerts:
- exp_labels:
alertname: Central latency burn rate for HTTP API
namespace: rhacs-abc
rhacs_instance_id: rhacs-abc
service: central
path: path
severity: warning
exp_annotations:
message: "Latency burn rate for central's HTTP API. Current burn rate per hour: 50."