From 6900ed2de6828460d067154e1dfd962353ee1b74 Mon Sep 17 00:00:00 2001 From: Stephan Hesselmann Date: Wed, 6 Mar 2024 13:24:55 +0100 Subject: [PATCH 1/2] fix: automerge action (#213) * fix: automerge action * limit branches --- .github/workflows/automerge.yaml | 33 +++++++++++++++++++ .github/workflows/sync-prod-from-stage.yaml | 5 ++- .github/workflows/sync-stage-from-master.yaml | 5 ++- 3 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/automerge.yaml diff --git a/.github/workflows/automerge.yaml b/.github/workflows/automerge.yaml new file mode 100644 index 00000000..5a436c31 --- /dev/null +++ b/.github/workflows/automerge.yaml @@ -0,0 +1,33 @@ +name: automerge +on: + pull_request: + branches: + - stage + - production + types: + - labeled + - unlabeled + - synchronize + - opened + - edited + - ready_for_review + - reopened + - unlocked + pull_request_review: + types: + - submitted + check_suite: + types: + - completed + status: {} +jobs: + automerge: + runs-on: ubuntu-latest + steps: + - id: automerge + name: automerge + uses: "pascalgn/automerge-action@v0.16.2" + env: + GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + MERGE_METHOD: merge + MERGE_FILTER_AUTHOR: github-actions diff --git a/.github/workflows/sync-prod-from-stage.yaml b/.github/workflows/sync-prod-from-stage.yaml index 0497af94..4607927a 100644 --- a/.github/workflows/sync-prod-from-stage.yaml +++ b/.github/workflows/sync-prod-from-stage.yaml @@ -10,10 +10,9 @@ jobs: uses: actions/checkout@v4 - name: Opening pull request id: pull - # We need main for PULL_REQUEST_AUTO_MERGE_METHOD, which is currently not yet released. - uses: tretuna/sync-branches@main + uses: tretuna/sync-branches@1.4.0 with: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} FROM_BRANCH: "stage" TO_BRANCH: "production" - PULL_REQUEST_AUTO_MERGE_METHOD: merge + LABELS: automerge diff --git a/.github/workflows/sync-stage-from-master.yaml b/.github/workflows/sync-stage-from-master.yaml index eb671afe..230cc45d 100644 --- a/.github/workflows/sync-stage-from-master.yaml +++ b/.github/workflows/sync-stage-from-master.yaml @@ -10,10 +10,9 @@ jobs: uses: actions/checkout@v4 - name: Opening pull request id: pull - # We need main for PULL_REQUEST_AUTO_MERGE_METHOD, which is currently not yet released. - uses: tretuna/sync-branches@main + uses: tretuna/sync-branches@1.4.0 with: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} FROM_BRANCH: "master" TO_BRANCH: "stage" - PULL_REQUEST_AUTO_MERGE_METHOD: merge + LABELS: automerge From 59ec468d8509de8e16bf06bf5fdd58d04ae30d96 Mon Sep 17 00:00:00 2001 From: Yury Kovalev <8366110+kovayur@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:17:42 +0200 Subject: [PATCH 2/2] ROX-16615: Remove the probe service from the Data Plane observability resources (#215) --- ...cluster-resource-adjustment-configmap.yaml | 20 ++++----- ...cluster-resource-adjustment-dashboard.yaml | 20 ++++----- .../rhacs-cluster-resource-adjustment.json | 20 ++++----- resources/index.json | 1 - .../pod_monitors/rhacs-probe-metrics.yaml | 36 ---------------- resources/prometheus/prometheus-rules.yaml | 42 ------------------- .../unit_tests/RHACSProbeContainerDown.yaml | 27 ------------ ...ACSProbeContainerFrequentlyRestarting.yaml | 27 ------------ .../unit_tests/RHACSProbeRunFailed.yaml | 31 -------------- .../unit_tests/RHACSProbeScrapeFailed.yaml | 30 ------------- 10 files changed, 30 insertions(+), 224 deletions(-) delete mode 100644 resources/prometheus/pod_monitors/rhacs-probe-metrics.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSProbeContainerDown.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSProbeContainerFrequentlyRestarting.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSProbeRunFailed.yaml delete mode 100644 resources/prometheus/unit_tests/RHACSProbeScrapeFailed.yaml diff --git a/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-configmap.yaml b/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-configmap.yaml index 9be79975..76e8aa92 100644 --- a/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-configmap.yaml +++ b/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-configmap.yaml @@ -4381,7 +4381,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4394,7 +4394,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4408,7 +4408,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4422,7 +4422,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4524,7 +4524,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4537,7 +4537,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4551,7 +4551,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4565,7 +4565,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4680,7 +4680,7 @@ data: }, "editorMode": "builder", "exemplar": false, - "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "__auto", "range": true, @@ -4781,7 +4781,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}[6h])))", + "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}[6h])))", "format": "time_series", "instant": true, "legendFormat": "__auto", diff --git a/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-dashboard.yaml b/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-dashboard.yaml index e25f4384..dae40e59 100644 --- a/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-dashboard.yaml +++ b/resources/grafana/generated/dashboards/rhacs-cluster-resource-adjustment-dashboard.yaml @@ -4381,7 +4381,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4394,7 +4394,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4408,7 +4408,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4422,7 +4422,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4524,7 +4524,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4537,7 +4537,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4551,7 +4551,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4565,7 +4565,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4680,7 +4680,7 @@ spec: }, "editorMode": "builder", "exemplar": false, - "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "__auto", "range": true, @@ -4781,7 +4781,7 @@ spec: }, "editorMode": "code", "exemplar": false, - "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}[6h])))", + "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}[6h])))", "format": "time_series", "instant": true, "legendFormat": "__auto", diff --git a/resources/grafana/sources/rhacs-cluster-resource-adjustment.json b/resources/grafana/sources/rhacs-cluster-resource-adjustment.json index 407c98b6..a7ea08ca 100644 --- a/resources/grafana/sources/rhacs-cluster-resource-adjustment.json +++ b/resources/grafana/sources/rhacs-cluster-resource-adjustment.json @@ -4370,7 +4370,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4383,7 +4383,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(.95, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4397,7 +4397,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.50, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4411,7 +4411,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4513,7 +4513,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "100%ile", "range": true, @@ -4526,7 +4526,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.95, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "95%ile", @@ -4540,7 +4540,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.5, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "50%ile", @@ -4554,7 +4554,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "quantile(0.1, sum by(namespace) (container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "hide": false, "legendFormat": "10%ile", @@ -4669,7 +4669,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}))", + "expr": "topk(5, sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}))", "format": "time_series", "legendFormat": "__auto", "range": true, @@ -4770,7 +4770,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|probe|secured-cluster)\", container!~\"POD|\"}[6h])))", + "expr": "topk(5, sum by(namespace) (avg_over_time(container_memory_working_set_bytes{namespace=~\"rhacs-.*\", namespace!~\"rhacs-(audit-logs|cloudwatch|observability|secured-cluster)\", container!~\"POD|\"}[6h])))", "format": "time_series", "instant": true, "legendFormat": "__auto", diff --git a/resources/index.json b/resources/index.json index b843e500..200262fc 100644 --- a/resources/index.json +++ b/resources/index.json @@ -7,7 +7,6 @@ "prometheus/pod_monitors/rhacs-central-metrics.yaml", "prometheus/pod_monitors/rhacs-cloudwatch-exporter.yaml", "prometheus/pod_monitors/rhacs-fleetshard-sync-metrics.yaml", - "prometheus/pod_monitors/rhacs-probe-metrics.yaml", "prometheus/pod_monitors/rhacs-scanner-metrics.yaml" ], "rules": [ diff --git a/resources/prometheus/pod_monitors/rhacs-probe-metrics.yaml b/resources/prometheus/pod_monitors/rhacs-probe-metrics.yaml deleted file mode 100644 index 1b9fbc6e..00000000 --- a/resources/prometheus/pod_monitors/rhacs-probe-metrics.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: rhacs-probe-metrics - labels: - app: rhacs -spec: - selector: - matchLabels: - app: "probe" - namespaceSelector: - any: true - podMetricsEndpoints: - - path: /metrics - port: monitoring - relabelings: - - action: labeldrop - regex: endpoint - - - sourceLabels: [container] - action: replace - targetLabel: job - - - action: labelmap - regex: __meta_kubernetes_pod_annotation_rhacs_redhat_com_(.+) - replacement: rhacs_${1} - - - action: labelmap - regex: __meta_kubernetes_pod_label_rhacs_redhat_com_(.+) - replacement: rhacs_${1} - - - sourceLabels: [rhacs_tenant] - targetLabel: rhacs_instance_id - - - action: labeldrop - regex: rhacs_tenant diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 1927af74..aa8d82c7 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -321,48 +321,6 @@ spec: description: 'The maximum send rate over the last hour is {{ $value }} messages/second, which is dangerously approaching the maximum limit of 14 per second.' sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-040-aws-ses-violation.md" - - - name: rhacs-probe - rules: - - alert: RHACSProbeRunFailed - expr: acs_probe_last_failure_timestamp > 0 and acs_probe_last_failure_timestamp >= acs_probe_last_success_timestamp - for: 30m - labels: - severity: critical - annotations: - summary: "The latest probe run failed at `{{ $value | humanizeTimestamp }}`." - description: "The latest run of probe `{{ $labels.pod }}` failed at `{{ $value | humanizeTimestamp }}`." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-008-probe-run-failed.md" - - alert: RHACSProbeScrapeFailed - expr: | - avg_over_time(up{job="probe"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{container="probe"} == 1 - for: 20m - labels: - severity: critical - annotations: - summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`." - description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" - - alert: RHACSProbeContainerDown - expr: | - avg_over_time(kube_pod_container_status_ready{container="probe"}[10m]) < 0.5 - for: 20m - labels: - severity: critical - annotations: - summary: "Probe container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status." - description: "Probe container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" - - alert: RHACSProbeContainerFrequentlyRestarting - expr: | - increase(kube_pod_container_status_restarts_total{container="probe"}[30m]) > 3 - labels: - severity: critical - annotations: - summary: "Probe container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times." - description: "Probe container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" - - name: deadmanssnitch rules: - alert: DeadMansSwitch diff --git a/resources/prometheus/unit_tests/RHACSProbeContainerDown.yaml b/resources/prometheus/unit_tests/RHACSProbeContainerDown.yaml deleted file mode 100644 index 73f66d4e..00000000 --- a/resources/prometheus/unit_tests/RHACSProbeContainerDown.yaml +++ /dev/null @@ -1,27 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: kube_pod_container_status_ready{namespace="rhacs-1234", pod="probe-1234", container="probe"} - values: "1+0x10 0+0x50" - alert_rule_test: - - eval_time: 15m - alertname: RHACSProbeContainerDown - exp_alerts: [] - - eval_time: 40m - alertname: RHACSProbeContainerDown - exp_alerts: - - exp_labels: - alertname: RHACSProbeContainerDown - pod: probe-1234 - container: probe - namespace: rhacs-1234 - severity: critical - exp_annotations: - summary: "Probe container `probe-1234/probe` in namespace `rhacs-1234` is down or in a CrashLoopBackOff status." - description: "Probe container `probe-1234/probe` in namespace `rhacs-1234` has been down or in a CrashLoopBackOff status for at least 10 minutes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" diff --git a/resources/prometheus/unit_tests/RHACSProbeContainerFrequentlyRestarting.yaml b/resources/prometheus/unit_tests/RHACSProbeContainerFrequentlyRestarting.yaml deleted file mode 100644 index 21ed7f61..00000000 --- a/resources/prometheus/unit_tests/RHACSProbeContainerFrequentlyRestarting.yaml +++ /dev/null @@ -1,27 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: kube_pod_container_status_restarts_total{namespace="rhacs-1234", pod="probe-1234-5678", container="probe"} - values: "0+0x10 1+1x10 4+1x20" - alert_rule_test: - - eval_time: 10m - alertname: RHACSProbeContainerFrequentlyRestarting - exp_alerts: [] - - eval_time: 30m - alertname: RHACSProbeContainerFrequentlyRestarting - exp_alerts: - - exp_labels: - alertname: RHACSProbeContainerFrequentlyRestarting - container: probe - namespace: rhacs-1234 - pod: probe-1234-5678 - severity: critical - exp_annotations: - summary: "Probe container `probe-1234-5678/probe` in namespace `rhacs-1234` restarted more than 3 times." - description: "Probe container `probe-1234-5678/probe` in namespace `rhacs-1234` has restarted more than 3 times during the last 30 minutes." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md" diff --git a/resources/prometheus/unit_tests/RHACSProbeRunFailed.yaml b/resources/prometheus/unit_tests/RHACSProbeRunFailed.yaml deleted file mode 100644 index 6d2bfb8b..00000000 --- a/resources/prometheus/unit_tests/RHACSProbeRunFailed.yaml +++ /dev/null @@ -1,31 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: acs_probe_last_failure_timestamp{namespace="rhacs-probe", pod="probe-1234"} - values: "0+0x10 0+0x15 7+0x60" - - series: acs_probe_last_success_timestamp{namespace="rhacs-probe", pod="probe-1234"} - values: "0+0x10 1+1x15 6+0x60" - alert_rule_test: - - eval_time: 0m - alertname: RHACSProbeRunFailed - exp_alerts: [] - - eval_time: 30m - alertname: RHACSProbeRunFailed - exp_alerts: [] - - eval_time: 60m - alertname: RHACSProbeRunFailed - exp_alerts: - - exp_labels: - alertname: RHACSProbeRunFailed - severity: critical - namespace: rhacs-probe - pod: probe-1234 - exp_annotations: - summary: "The latest probe run failed at `1970-01-01 00:00:07 +0000 UTC`." - description: "The latest run of probe `probe-1234` failed at `1970-01-01 00:00:07 +0000 UTC`." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-008-probe-run-failed.md" diff --git a/resources/prometheus/unit_tests/RHACSProbeScrapeFailed.yaml b/resources/prometheus/unit_tests/RHACSProbeScrapeFailed.yaml deleted file mode 100644 index 70106a69..00000000 --- a/resources/prometheus/unit_tests/RHACSProbeScrapeFailed.yaml +++ /dev/null @@ -1,30 +0,0 @@ -rule_files: - - /tmp/prometheus-rules-test.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: up{namespace="rhacs-1234", pod="probe-1234-5678", job="probe", instance="1.2.3.4:9090"} - values: "0+0x20 1+0x20" - - series: kube_pod_container_status_ready{namespace="rhacs-1234", pod="probe-1234-5678", container="probe"} - values: "1+0x40" - alert_rule_test: - - eval_time: 10m - alertname: RHACSProbeScrapeFailed - exp_alerts: [] - - eval_time: 25m - alertname: RHACSProbeScrapeFailed - exp_alerts: - - exp_labels: - alertname: RHACSProbeScrapeFailed - instance: 1.2.3.4:9090 - namespace: rhacs-1234 - pod: probe-1234-5678 - severity: critical - job: probe - exp_annotations: - summary: "Prometheus unable to scrape metrics from target `probe-1234-5678` in namespace `rhacs-1234`." - description: "During the last 10 minutes, only `45.45%` of scrapes of target `probe-1234-5678` in namespace `rhacs-1234` were successful. This alert is raised when less than 50% of scrapes are successful." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-009-probe-unavailable.md"