diff --git a/resources/prometheus/unit_tests/ClusterAutoscalerNotSafeToScale.yaml b/resources/prometheus/unit_tests/ClusterAutoscalerNotSafeToScale.yaml
new file mode 100644
index 00000000..d84918d6
--- /dev/null
+++ b/resources/prometheus/unit_tests/ClusterAutoscalerNotSafeToScale.yaml
@@ -0,0 +1,37 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: cluster_autoscaler_cluster_safe_to_autoscale{service="custom-autoscaler"}
+        values: "0+0x20"
+    alert_rule_test:
+      - eval_time: 16m
+        alertname: ClusterAutoscalerNotSafeToScale
+        exp_alerts: []
+
+  - interval: 1m
+    input_series:
+      - series: cluster_autoscaler_cluster_safe_to_autoscale{service="cluster-autoscaler-default"}
+        values: "1+0x20 0+0x20"
+    alert_rule_test:
+      - eval_time: 16m
+        alertname: ClusterAutoscalerNotSafeToScale
+        exp_alerts: []
+      - eval_time: 36m
+        alertname: ClusterAutoscalerNotSafeToScale
+        exp_alerts:
+          - exp_labels:
+              alertname: ClusterAutoscalerNotSafeToScale
+              severity: warning
+              service: cluster-autoscaler-default
+            exp_annotations:
+              summary: "Cluster Autoscaler is reporting that the cluster is not ready for scaling."
+              description: "The cluster autoscaler has detected that the number of unready nodes is too high
+                and it is not safe to continue scaling operations. It makes this determination by checking that the number of ready nodes is greater than the minimum ready count
+                (default of 3) and the ratio of unready to ready nodes is less than the maximum unready node percentage (default of 45%). If either of those conditions are not
+                true then the cluster autoscaler will enter an unsafe to scale state until the conditions change."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
diff --git a/resources/prometheus/unit_tests/ClusterAutoscalerUnableToScaleCPULimitReached.yaml b/resources/prometheus/unit_tests/ClusterAutoscalerUnableToScaleCPULimitReached.yaml
new file mode 100644
index 00000000..0dfea0e8
--- /dev/null
+++ b/resources/prometheus/unit_tests/ClusterAutoscalerUnableToScaleCPULimitReached.yaml
@@ -0,0 +1,42 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: cluster_autoscaler_skipped_scale_events_count{service="custom-autoscaler",direction="up",reason="CpuResourceLimit"}
+        values: "0+0x10 0+1x30"
+      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="down",reason="CpuResourceLimit"}
+        values: "0+0x10 0+1x30"
+      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="SomeResourceLimit"}
+        values: "0+0x10 0+1x30"
+    alert_rule_test:
+      - eval_time: 30m
+        alertname: ClusterAutoscalerUnableToScaleCPULimitReached
+        exp_alerts: []
+
+  - interval: 1m
+    input_series:
+      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="CpuResourceLimit"}
+        values: "0+0x10 0+1x30"
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: ClusterAutoscalerUnableToScaleCPULimitReached
+        exp_alerts: []
+      - eval_time: 30m
+        alertname: ClusterAutoscalerUnableToScaleCPULimitReached
+        exp_alerts:
+          - exp_labels:
+              alertname: ClusterAutoscalerUnableToScaleCPULimitReached
+              severity: info
+              service: cluster-autoscaler-default
+              direction: up
+              reason: CpuResourceLimit
+            exp_annotations:
+              summary: "Cluster Autoscaler has reached its maximum CPU core limit and is unable to scale out."
+              description: "The number of total cores in the cluster has exceeded the maximum number set on the
+                cluster autoscaler. This is calculated by summing the cpu capacity for all nodes in the cluster and comparing that number against the maximum cores value set for the
+                cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
diff --git a/resources/prometheus/unit_tests/ClusterAutoscalerUnableToScaleMemoryLimitReached.yaml b/resources/prometheus/unit_tests/ClusterAutoscalerUnableToScaleMemoryLimitReached.yaml
new file mode 100644
index 00000000..7b128705
--- /dev/null
+++ b/resources/prometheus/unit_tests/ClusterAutoscalerUnableToScaleMemoryLimitReached.yaml
@@ -0,0 +1,42 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: cluster_autoscaler_skipped_scale_events_count{service="custom-autoscaler",direction="up",reason="MemoryResourceLimit"}
+        values: "0+0x10 0+1x30"
+      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="down",reason="MemoryResourceLimit"}
+        values: "0+0x10 0+1x30"
+      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="SomeResourceLimit"}
+        values: "0+0x10 0+1x30"
+    alert_rule_test:
+      - eval_time: 30m
+        alertname: ClusterAutoscalerUnableToScaleMemoryLimitReached
+        exp_alerts: []
+
+  - interval: 1m
+    input_series:
+      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="MemoryResourceLimit"}
+        values: "0+0x10 0+1x30"
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: ClusterAutoscalerUnableToScaleMemoryLimitReached
+        exp_alerts: []
+      - eval_time: 30m
+        alertname: ClusterAutoscalerUnableToScaleMemoryLimitReached
+        exp_alerts:
+          - exp_labels:
+              alertname: ClusterAutoscalerUnableToScaleMemoryLimitReached
+              severity: info
+              service: cluster-autoscaler-default
+              direction: up
+              reason: MemoryResourceLimit
+            exp_annotations:
+              summary: "Cluster Autoscaler has reached its maximum Memory bytes limit and is unable to scale out."
+              description: "The number of total bytes of RAM in the cluster has exceeded the maximum number set on
+                the cluster autoscaler. This is calculated by summing the memory capacity for all nodes in the cluster and comparing that number against the maximum memory bytes value set
+                for the cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration."
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md" diff --git a/resources/prometheus/unit_tests/ClusterAutoscalerUnschedulablePods.yaml b/resources/prometheus/unit_tests/ClusterAutoscalerUnschedulablePods.yaml new file mode 100644 index 00000000..02b0631f --- /dev/null +++ b/resources/prometheus/unit_tests/ClusterAutoscalerUnschedulablePods.yaml @@ -0,0 +1,34 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: cluster_autoscaler_unschedulable_pods_count{service="custom-autoscaler"} + values: "1+0x40" + alert_rule_test: + - eval_time: 35m + alertname: ClusterAutoscalerUnschedulablePods + exp_alerts: [] + + - interval: 1m + input_series: + - series: cluster_autoscaler_unschedulable_pods_count{service="cluster-autoscaler-default"} + values: "0+0x20 2+0x50" + alert_rule_test: + - eval_time: 25m + alertname: ClusterAutoscalerUnschedulablePods + exp_alerts: [] + - eval_time: 60m + alertname: ClusterAutoscalerUnschedulablePods + exp_alerts: + - exp_labels: + alertname: ClusterAutoscalerUnschedulablePods + severity: critical + service: cluster-autoscaler-default + exp_annotations: + summary: "Cluster Autoscaler has 2 unschedulable pods." + description: "The cluster autoscaler is unable to scale up and is alerting that there are unschedulable pods because of this condition. This may be caused by the cluster autoscaler reaching its resources limits, or by Kubernetes waiting for new nodes to become ready." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md" diff --git a/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml b/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml deleted file mode 100644 index 1a8af897..00000000 --- a/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml +++ /dev/null @@ -1,54 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: kube_node_role{node="worker-1", role="acscs-worker"} - values: "1" - - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} - values: "1" - - series: kube_node_status_allocatable{node="worker-1", resource="cpu", job="kube-state-metrics"} - values: "100" - - series: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{node="worker-1", resource="cpu", job="kube-state-metrics"} - values: "86" - alert_rule_test: - - eval_time: 1m - alertname: WorkerNodesCPUQuotaOverCommitWarning - exp_alerts: [] - - eval_time: 5m - alertname: WorkerNodesCPUQuotaOverCommitWarning - exp_alerts: - - exp_labels: - alertname: WorkerNodesCPUQuotaOverCommitWarning - severity: warning - exp_annotations: - description: "During the last 5 minutes, the average CPU request commitment on worker nodes was 86%. This is above the recommended threshold of 85%." - summary: "There is a risk of over-committing CPU resources on worker nodes." 
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - - interval: 1m - input_series: - - series: kube_node_role{node="worker-1", role="acscs-worker"} - values: "1" - - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} - values: "1" - - series: kube_node_status_allocatable{node="worker-1", resource="cpu", job="kube-state-metrics"} - values: "100" - - series: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{node="worker-1", resource="cpu", job="kube-state-metrics"} - values: "96" - alert_rule_test: - - eval_time: 1m - alertname: WorkerNodesCPUQuotaOverCommit - exp_alerts: [] - - eval_time: 5m - alertname: WorkerNodesCPUQuotaOverCommit - exp_alerts: - - exp_labels: - alertname: WorkerNodesCPUQuotaOverCommit - severity: critical - exp_annotations: - description: "During the last 5 minutes, the average CPU request commitment on worker nodes was 96%. This is above the critical threshold of 95%." - summary: "There is a high risk of over-committing CPU resources on worker nodes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" diff --git a/resources/prometheus/unit_tests/WorkerNodesMemoryOverCommit.yaml b/resources/prometheus/unit_tests/WorkerNodesMemoryOverCommit.yaml deleted file mode 100644 index e22392bc..00000000 --- a/resources/prometheus/unit_tests/WorkerNodesMemoryOverCommit.yaml +++ /dev/null @@ -1,30 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: kube_node_role{node="worker-1", role="acscs-worker"} - values: "1" - - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} - values: "1" - - series: kube_node_status_allocatable{node="worker-1", resource="memory", job="kube-state-metrics"} - values: "100" - - series: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{node="worker-1", resource="memory", job="kube-state-metrics"} - values: "201" - alert_rule_test: - - eval_time: 1m - alertname: WorkerNodesMemoryOverCommit - exp_alerts: [] - - eval_time: 5m - alertname: WorkerNodesMemoryOverCommit - exp_alerts: - - exp_labels: - alertname: WorkerNodesMemoryOverCommit - severity: critical - exp_annotations: - description: "During the last 5 minutes, the average Memory limit commitment on worker nodes was 201%. This is above the recommended threshold of 200%." - summary: "There is a high risk of over-committing Memory resources on worker nodes." 
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" diff --git a/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml b/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml deleted file mode 100644 index 86399746..00000000 --- a/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml +++ /dev/null @@ -1,54 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: kube_node_role{node="worker-1", role="acscs-worker"} - values: "1" - - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} - values: "1" - - series: kube_node_status_allocatable{node="worker-1", resource="memory", job="kube-state-metrics"} - values: "100" - - series: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{node="worker-1", resource="memory", job="kube-state-metrics"} - values: "86" - alert_rule_test: - - eval_time: 1m - alertname: WorkerNodesMemoryQuotaOverCommitWarning - exp_alerts: [] - - eval_time: 5m - alertname: WorkerNodesMemoryQuotaOverCommitWarning - exp_alerts: - - exp_labels: - alertname: WorkerNodesMemoryQuotaOverCommitWarning - severity: warning - exp_annotations: - description: "During the last 5 minutes, the average memory request commitment on worker nodes was 86%. This is above the recommended threshold of 85%." - summary: "There is a risk of over-committing Memory resources on worker nodes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - - interval: 1m - input_series: - - series: kube_node_role{node="worker-1", role="acscs-worker"} - values: "1" - - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} - values: "1" - - series: kube_node_status_allocatable{node="worker-1", resource="memory", job="kube-state-metrics"} - values: "100" - - series: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{node="worker-1", resource="memory", job="kube-state-metrics"} - values: "96" - alert_rule_test: - - eval_time: 1m - alertname: WorkerNodesMemoryQuotaOverCommit - exp_alerts: [] - - eval_time: 5m - alertname: WorkerNodesMemoryQuotaOverCommit - exp_alerts: - - exp_labels: - alertname: WorkerNodesMemoryQuotaOverCommit - severity: critical - exp_annotations: - description: "During the last 5 minutes, the average memory request commitment on worker nodes was 96%. This is above the critical threshold of 95%." - summary: "There is a high risk of over-committing Memory resources on worker nodes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"