Skip to content

Commit

Permalink
Adjust test
Browse files Browse the repository at this point in the history
  • Loading branch information
mtodor committed Apr 15, 2024
1 parent 0d142cc commit 3a6fcd3
Show file tree
Hide file tree
Showing 7 changed files with 155 additions and 138 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# promtool unit test for the ClusterAutoscalerNotSafeToScale alert.
#
# Series values use promtool's expanding notation 'a+bxn': start at a, add b
# per step, repeat n more times (n+1 samples, one per `interval`).
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  # Case 1: the gauge is 0 for the whole run, but on a series labelled
  # service="custom-autoscaler". No alert is expected.
  # NOTE(review): presumably the rule only selects
  # service="cluster-autoscaler-default" - confirm against the rule file.
  - interval: 1m
    input_series:
      - series: cluster_autoscaler_cluster_safe_to_autoscale{service="custom-autoscaler"}
        values: "0+0x20"
    alert_rule_test:
      - eval_time: 16m
        alertname: ClusterAutoscalerNotSafeToScale
        exp_alerts: []

  # Case 2: the default autoscaler reports 1 for minutes 0-20, then 0 for
  # minutes 21-41.
  - interval: 1m
    input_series:
      - series: cluster_autoscaler_cluster_safe_to_autoscale{service="cluster-autoscaler-default"}
        values: "1+0x20 0+0x20"
    alert_rule_test:
      # Gauge is still 1 at this point: nothing may fire.
      - eval_time: 16m
        alertname: ClusterAutoscalerNotSafeToScale
        exp_alerts: []
      # Gauge has been 0 since minute 21: the alert must be firing.
      # NOTE(review): the rule's `for` duration is not visible here; this
      # leaves a 15m window - confirm it still exceeds `for` if that changes.
      - eval_time: 36m
        alertname: ClusterAutoscalerNotSafeToScale
        exp_alerts:
          - exp_labels:
              alertname: ClusterAutoscalerNotSafeToScale
              severity: warning
              service: cluster-autoscaler-default
            # Annotations are compared verbatim with the rule's output, so
            # the "continute" typo below must stay until the rule file is
            # corrected in the same change.
            exp_annotations:
              summary: "Cluster Autoscaler is reporting that the cluster is not ready for scaling."
              description: "The cluster autoscaler has detected that the number of unready nodes is too high
                and it is not safe to continute scaling operations. It makes this determination by checking that the number of ready nodes is greater than the minimum ready count
                (default of 3) and the ratio of unready to ready nodes is less than the maximum unready node percentage (default of 45%). If either of those conditions are not
                true then the cluster autoscaler will enter an unsafe to scale state until the conditions change."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# promtool unit test for the ClusterAutoscalerUnableToScaleCPULimitReached
# alert.
#
# Series values use promtool's expanding notation 'a+bxn': start at a, add b
# per step, repeat n more times. "0+0x10 0+1x30" is therefore flat at 0 for
# minutes 0-10, then restarts at 0 and grows by 1 per minute (minutes 11-41).
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  # Case 1: three growing counters that each differ from the series used in
  # case 2 by exactly one label (service, direction, reason). None of them
  # is expected to raise the alert.
  - interval: 1m
    input_series:
      # Different service.
      - series: cluster_autoscaler_skipped_scale_events_count{service="custom-autoscaler",direction="up",reason="CpuResourceLimit"}
        values: "0+0x10 0+1x30"
      # Different direction.
      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="down",reason="CpuResourceLimit"}
        values: "0+0x10 0+1x30"
      # Different reason.
      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="SomeResourceLimit"}
        values: "0+0x10 0+1x30"
    alert_rule_test:
      - eval_time: 30m
        alertname: ClusterAutoscalerUnableToScaleCPULimitReached
        exp_alerts: []

  # Case 2: default autoscaler, direction "up", reason "CpuResourceLimit".
  - interval: 1m
    input_series:
      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="CpuResourceLimit"}
        values: "0+0x10 0+1x30"
    alert_rule_test:
      # Counter is still flat at 0: nothing may fire.
      - eval_time: 5m
        alertname: ClusterAutoscalerUnableToScaleCPULimitReached
        exp_alerts: []
      # Counter has been increasing since minute 12: the alert must fire and
      # carry the series labels.
      - eval_time: 30m
        alertname: ClusterAutoscalerUnableToScaleCPULimitReached
        exp_alerts:
          - exp_labels:
              alertname: ClusterAutoscalerUnableToScaleCPULimitReached
              severity: info
              service: cluster-autoscaler-default
              direction: up
              reason: CpuResourceLimit
            # Annotations are compared verbatim with the rule's output.
            exp_annotations:
              summary: "Cluster Autoscaler has reached its maximum CPU core limit and is unable to scale out."
              description: "The number of total cores in the cluster has exceeded the maximum number set on the
                cluster autoscaler. This is calculated by summing the cpu capacity for all nodes in the cluster and comparing that number against the maximum cores value set for the
                cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# promtool unit test for the ClusterAutoscalerUnableToScaleMemoryLimitReached
# alert.
#
# Series values use promtool's expanding notation 'a+bxn': start at a, add b
# per step, repeat n more times. "0+0x10 0+1x30" is therefore flat at 0 for
# minutes 0-10, then restarts at 0 and grows by 1 per minute (minutes 11-41).
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  # Case 1: three growing counters that each differ from the series used in
  # case 2 by exactly one label (service, direction, reason). None of them
  # is expected to raise the alert.
  - interval: 1m
    input_series:
      # Different service.
      - series: cluster_autoscaler_skipped_scale_events_count{service="custom-autoscaler",direction="up",reason="MemoryResourceLimit"}
        values: "0+0x10 0+1x30"
      # Different direction.
      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="down",reason="MemoryResourceLimit"}
        values: "0+0x10 0+1x30"
      # Different reason.
      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="SomeResourceLimit"}
        values: "0+0x10 0+1x30"
    alert_rule_test:
      - eval_time: 30m
        alertname: ClusterAutoscalerUnableToScaleMemoryLimitReached
        exp_alerts: []

  # Case 2: default autoscaler, direction "up", reason "MemoryResourceLimit".
  - interval: 1m
    input_series:
      - series: cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="MemoryResourceLimit"}
        values: "0+0x10 0+1x30"
    alert_rule_test:
      # Counter is still flat at 0: nothing may fire.
      - eval_time: 5m
        alertname: ClusterAutoscalerUnableToScaleMemoryLimitReached
        exp_alerts: []
      # Counter has been increasing since minute 12: the alert must fire and
      # carry the series labels.
      - eval_time: 30m
        alertname: ClusterAutoscalerUnableToScaleMemoryLimitReached
        exp_alerts:
          - exp_labels:
              alertname: ClusterAutoscalerUnableToScaleMemoryLimitReached
              severity: info
              service: cluster-autoscaler-default
              direction: up
              reason: MemoryResourceLimit
            # Annotations are compared verbatim with the rule's output.
            exp_annotations:
              summary: "Cluster Autoscaler has reached its maximum Memory bytes limit and is unable to scale out."
              description: "The number of total bytes of RAM in the cluster has exceeded the maximum number set on
                the cluster autoscaler. This is calculated by summing the memory capacity for all nodes in the cluster and comparing that number against the maximum memory bytes value set
                for the cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# promtool unit test for the ClusterAutoscalerUnschedulablePods alert.
#
# Series values use promtool's expanding notation 'a+bxn': start at a, add b
# per step, repeat n more times (n+1 samples, one per `interval`).
rule_files:
  - /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
  # Case 1: one unschedulable pod for the whole run (minutes 0-40), but on a
  # series labelled service="custom-autoscaler". No alert is expected.
  # NOTE(review): presumably the rule only selects
  # service="cluster-autoscaler-default" - confirm against the rule file.
  - interval: 1m
    input_series:
      - series: cluster_autoscaler_unschedulable_pods_count{service="custom-autoscaler"}
        values: "1+0x40"
    alert_rule_test:
      - eval_time: 35m
        alertname: ClusterAutoscalerUnschedulablePods
        exp_alerts: []

  # Case 2: the default autoscaler reports 0 unschedulable pods for minutes
  # 0-20, then 2 for minutes 21-71.
  - interval: 1m
    input_series:
      - series: cluster_autoscaler_unschedulable_pods_count{service="cluster-autoscaler-default"}
        values: "0+0x20 2+0x50"
    alert_rule_test:
      # Pods have been unschedulable for only 4 minutes: nothing may fire yet.
      - eval_time: 25m
        alertname: ClusterAutoscalerUnschedulablePods
        exp_alerts: []
      # Pods have been unschedulable since minute 21: the alert must fire.
      - eval_time: 60m
        alertname: ClusterAutoscalerUnschedulablePods
        exp_alerts:
          - exp_labels:
              alertname: ClusterAutoscalerUnschedulablePods
              severity: critical
              service: cluster-autoscaler-default
            # Annotations are compared verbatim with the rule's output. The
            # "2" in the summary matches the series value above
            # (presumably templated from the sample value - confirm in the
            # rule file).
            exp_annotations:
              summary: "Cluster Autoscaler has 2 unschedulable pods."
              description: "The cluster autoscaler is unable to scale up and is alerting that there are unschedulable pods because of this condition. This may be caused by the cluster autoscaler reaching its resources limits, or by Kubernetes waiting for new nodes to become ready."
              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
54 changes: 0 additions & 54 deletions resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml

This file was deleted.

30 changes: 0 additions & 30 deletions resources/prometheus/unit_tests/WorkerNodesMemoryOverCommit.yaml

This file was deleted.

This file was deleted.

0 comments on commit 3a6fcd3

Please sign in to comment.