diff --git a/.gitattributes b/.gitattributes index f6eda4ca..d3c9b39a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,3 +3,4 @@ resources/grafana/generated/** linguist-generated=true resources/grafana/mixins/** linguist-generated=true resources/prometheus/kubernetes-mixin-alerts.yaml linguist-generated=true resources/prometheus/kubernetes-mixin-rules.yaml linguist-generated=true +resources/prometheus/federation-config.yaml linguist-generated=true diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9277938d..3bde3631 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,6 +17,11 @@ jobs: run: | curl -s -S -L -o /tmp/promtool https://github.com/prometheus/prometheus/releases/download/v2.36.0/prometheus-2.36.0.linux-amd64.tar.gz tar -zxf /tmp/promtool --strip-components=1 --directory /usr/local/bin &> /dev/null + - name: Install mimirtool + run: | + curl --silent --show-error --fail --location https://github.com/grafana/mimir/releases/download/mimir-2.10.5/mimirtool-linux-amd64 --output /usr/local/bin/mimirtool + echo "72f46c82c303c48566844612f83ab53d4b804c665644163ad7f0f8945caa0521 /usr/local/bin/mimirtool" | sha256sum --check --status + chmod +x /usr/local/bin/mimirtool - name: Patch PATH to include GOBIN run: echo "PATH=$(echo $PATH):/home/runner/go/bin" >> $GITHUB_ENV - uses: pre-commit/action@v3.0.1 diff --git a/.github/workflows/sync-prod-from-stage.yaml b/.github/workflows/sync-prod-from-stage.yaml index 97a919be..0497af94 100644 --- a/.github/workflows/sync-prod-from-stage.yaml +++ b/.github/workflows/sync-prod-from-stage.yaml @@ -10,8 +10,10 @@ jobs: uses: actions/checkout@v4 - name: Opening pull request id: pull - uses: tretuna/sync-branches@1.4.0 + # We need main for PULL_REQUEST_AUTO_MERGE_METHOD, which is currently not yet released. 
+ uses: tretuna/sync-branches@main with: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} FROM_BRANCH: "stage" TO_BRANCH: "production" + PULL_REQUEST_AUTO_MERGE_METHOD: merge diff --git a/.github/workflows/sync-stage-from-master.yaml b/.github/workflows/sync-stage-from-master.yaml index b2736cc4..eb671afe 100644 --- a/.github/workflows/sync-stage-from-master.yaml +++ b/.github/workflows/sync-stage-from-master.yaml @@ -10,8 +10,10 @@ jobs: uses: actions/checkout@v4 - name: Opening pull request id: pull - uses: tretuna/sync-branches@1.4.0 + # We need main for PULL_REQUEST_AUTO_MERGE_METHOD, which is currently not yet released. + uses: tretuna/sync-branches@main with: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} FROM_BRANCH: "master" TO_BRANCH: "stage" + PULL_REQUEST_AUTO_MERGE_METHOD: merge diff --git a/Makefile b/Makefile index 4789bae4..b5020a16 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,15 @@ -.PHONY: generate -generate: +.PHONY: generate-resources +generate-resources: $(MAKE) -C resources/mixins/kubernetes generate $(MAKE) -C resources/grafana generate +.PHONY: generate-federate +generate-federate: + @scripts/generate-federate-match.sh + +.PHONY: generate +generate: generate-resources generate-federate + .PHONY: update update: $(MAKE) -C resources/mixins/kubernetes update diff --git a/README.md b/README.md index aec9924a..bd323955 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,13 @@ GitHub actions to trigger branch synchronization. ## Contributing +### Prerequisites + +The following tools are required for development: +- `jq` - please follow [the installation instructions](https://jqlang.github.io/jq/download). +- `mimirtool` - please follow [the installation instructions](https://grafana.com/docs/mimir/latest/manage/tools/mimirtool/#installation). +- `yq` - please follow [the installation instructions](https://github.com/mikefarah/yq/#install). 
+ ### Dashboards To make changes to the rhacs dashboards: @@ -48,6 +55,15 @@ Then: * Update `resources/mixins/kubernetes/mixin.libsonnet`. * Run `make generate` to generate the corresponding mixin resources. +### Federated metrics + +If you make changes to Alerts, Recording rules, or Grafana dashboards, and if they include metrics collected by OSD Prometheus, ensure that the federation config includes the new metrics. + +* Run `make generate-federate` to update the federation config. +* Commit the resulting changes in `resources/prometheus/federation-config.yaml` to the repo. + +You can add additional federated metrics that are not used in any Alert, Recording rule, or Grafana dashboard to `resources/prometheus/federation-config-base.yaml`, and they will be merged with the other metrics. Always add a comment with the reason why metrics are added to the base list. + ### Pre-commit hook This repository makes use of [pre-commit](https://pre-commit.com/) framework. Refer to the [installation instructions](https://pre-commit.com/#installation) for further information. 
diff --git a/resources/grafana/generated/dashboards/rhacs-cluster-overview-configmap.yaml b/resources/grafana/generated/dashboards/rhacs-cluster-overview-configmap.yaml index abe1753e..71e0865e 100644 --- a/resources/grafana/generated/dashboards/rhacs-cluster-overview-configmap.yaml +++ b/resources/grafana/generated/dashboards/rhacs-cluster-overview-configmap.yaml @@ -1386,7 +1386,39 @@ data: { "matcher": { "id": "byName", - "options": "Value #Memory consumption" + "options": "ScannerV2 memory" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ScannerV2 CPU" }, "properties": [ { @@ -1400,10 +1432,6 @@ data: "type": "gauge" } }, - { - "id": "max", - "value": 1 - }, { "id": "thresholds", "value": { @@ -1414,8 +1442,8 @@ data: "value": null }, { - "color": "orange", - "value": 70 + "color": "#EAB839", + "value": 60 }, { "color": "red", @@ -1423,47 +1451,55 @@ data: } ] } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Network received" - }, - "properties": [ + }, { - "id": "unit", - "value": "Bps" + "id": "decimals", + "value": 1 } ] }, { "matcher": { "id": "byName", - "options": "Network transmitted" + "options": "Matcher memory" }, "properties": [ { "id": "unit", - "value": "Bps" + "value": "decbytes" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" } ] }, { "matcher": { "id": "byName", - "options": "CPU consumption" + "options": "Matcher CPU" }, "properties": [ { "id": "unit", "value": "percentunit" }, - { - "id": "max", - "value": 1 - }, { "id": 
"custom.cellOptions", "value": { @@ -1481,35 +1517,63 @@ data: "value": null }, { - "color": "orange", + "color": "yellow", "value": 60 }, { "color": "red", - "value": 80 + "value": 90 } ] } }, { "id": "decimals", - "value": 2 + "value": 1 } ] }, { "matcher": { "id": "byName", - "options": "CPU throttle" + "options": "Indexer memory" }, "properties": [ { "id": "unit", - "value": "percentunit" + "value": "decbytes" }, { - "id": "max", - "value": 1 + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Indexer CPU" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" }, { "id": "custom.cellOptions", @@ -1528,12 +1592,12 @@ data: "value": null }, { - "color": "orange", - "value": 10 + "color": "#EAB839", + "value": 60 }, { "color": "red", - "value": 50 + "value": 90 } ] } @@ -1543,6 +1607,30 @@ data: "value": 1 } ] + }, + { + "matcher": { + "id": "byName", + "options": "Organization" + }, + "properties": [ + { + "id": "noValue", + "value": "DELETED" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Version" + }, + "properties": [ + { + "id": "noValue", + "value": "DELETED" + } + ] } ] }, @@ -1579,12 +1667,12 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", container=\"scanner\", job=~\"kubelet\"}) by (namespace) / sum(container_spec_memory_limit_bytes{namespace=~\"rhacs-$instance_id\", container=\"scanner\", job=~\"kubelet\"}) by (namespace)", + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"scanner\"}) by (namespace)", "format": "table", "instant": true, "legendFormat": "__auto", "range": false, - "refId": "Memory consumption" + "refId": "ScannerV2 
memory" }, { "datasource": { @@ -1593,13 +1681,28 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"rhacs-$instance_id\", job=~\"kubelet\", pod=~\"scanner-.*\"}[5m])) by (namespace)", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"scanner\"}[$__range])) by (namespace)", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, - "refId": "Network received" + "refId": "ScannerV2 CPU" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"matcher\"}) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Matcher memory" }, { "datasource": { @@ -1608,13 +1711,13 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"rhacs-$instance_id\", job=~\"kubelet\", pod=~\"scanner-.*\"}[5m])) by (namespace)", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"matcher\"}[$__range])) by (namespace)", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, - "refId": "Network Transmitted" + "refId": "Matcher CPU" }, { "datasource": { @@ -1623,13 +1726,43 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(process_cpu_seconds_total{namespace=~\"rhacs-$instance_id\", job=~\"scanner\"}) by (namespace, rhacs_org_name, rhacs_org_id)", + "expr": "sum(process_cpu_seconds_total{namespace=~\"rhacs-$instance_id\", job=~\"scanner\"}) by (namespace, rhacs_org_name, rhacs_version)", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": 
false, "refId": "Organization" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"indexer\"}) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Indexer memory" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"indexer\"}[$__range])) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Indexer CPU" } ], "title": "Scanner Overview Table", @@ -1638,7 +1771,7 @@ data: "id": "seriesToColumns", "options": { "byField": "namespace", - "mode": "inner" + "mode": "outer" } }, { @@ -1657,33 +1790,47 @@ data: "Value #Organization": true }, "indexByName": { - "Time": 6, - "Time 2": 7, - "Time 3": 8, - "Time 4": 9, - "Value #Memory consumption": 1, - "Value #Network Transmitted": 3, - "Value #Network received": 2, - "Value #Organization": 10, + "Time 1": 13, + "Time 2": 9, + "Time 3": 10, + "Time 4": 11, + "Time 5": 14, + "Time 6": 15, + "Time 7": 16, + "Value #Indexer CPU": 6, + "Value #Indexer memory": 5, + "Value #Matcher CPU": 4, + "Value #Matcher memory": 3, + "Value #Organization": 12, + "Value #ScannerV2 CPU": 2, + "Value #ScannerV2 memory": 1, "namespace": 0, - "rhacs_org_id": 5, - "rhacs_org_name": 4 + "rhacs_org_name": 7, + "rhacs_version": 8 }, "renameByName": { "Time": "", + "Time 7": "", "Value": "CPU consumption", "Value #A": "", "Value #CPU Throttle": "CPU throttle", "Value #CPU consumption": "CPU consumption", "Value #CPU throttle": "CPU throttle", + "Value #Indexer CPU": "Indexer CPU", + "Value #Indexer memory": "Indexer 
memory", + "Value #Matcher CPU": "Matcher CPU", + "Value #Matcher memory": "Matcher memory", "Value #Memory consumption": "Memory consumption", "Value #Network Transmitted": "Network transmitted", "Value #Network received": "Network received", "Value #Organisation": "", "Value #Organization": "", + "Value #ScannerV2 CPU": "ScannerV2 CPU", + "Value #ScannerV2 memory": "ScannerV2 memory", "namespace": "Namespace", "rhacs_org_id": "Organization ID", - "rhacs_org_name": "Organization" + "rhacs_org_name": "Organization", + "rhacs_version": "Version" } } } diff --git a/resources/grafana/generated/dashboards/rhacs-cluster-overview-dashboard.yaml b/resources/grafana/generated/dashboards/rhacs-cluster-overview-dashboard.yaml index cdd50710..b11775cc 100644 --- a/resources/grafana/generated/dashboards/rhacs-cluster-overview-dashboard.yaml +++ b/resources/grafana/generated/dashboards/rhacs-cluster-overview-dashboard.yaml @@ -1386,7 +1386,39 @@ spec: { "matcher": { "id": "byName", - "options": "Value #Memory consumption" + "options": "ScannerV2 memory" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ScannerV2 CPU" }, "properties": [ { @@ -1400,10 +1432,6 @@ spec: "type": "gauge" } }, - { - "id": "max", - "value": 1 - }, { "id": "thresholds", "value": { @@ -1414,8 +1442,8 @@ spec: "value": null }, { - "color": "orange", - "value": 70 + "color": "#EAB839", + "value": 60 }, { "color": "red", @@ -1423,47 +1451,55 @@ spec: } ] } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Network received" - }, - "properties": [ + }, { - "id": "unit", - "value": "Bps" + "id": "decimals", + "value": 1 } ] }, { "matcher": { "id": "byName", - "options": "Network 
transmitted" + "options": "Matcher memory" }, "properties": [ { "id": "unit", - "value": "Bps" + "value": "decbytes" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" } ] }, { "matcher": { "id": "byName", - "options": "CPU consumption" + "options": "Matcher CPU" }, "properties": [ { "id": "unit", "value": "percentunit" }, - { - "id": "max", - "value": 1 - }, { "id": "custom.cellOptions", "value": { @@ -1481,35 +1517,63 @@ spec: "value": null }, { - "color": "orange", + "color": "yellow", "value": 60 }, { "color": "red", - "value": 80 + "value": 90 } ] } }, { "id": "decimals", - "value": 2 + "value": 1 } ] }, { "matcher": { "id": "byName", - "options": "CPU throttle" + "options": "Indexer memory" }, "properties": [ { "id": "unit", - "value": "percentunit" + "value": "decbytes" }, { - "id": "max", - "value": 1 + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Indexer CPU" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" }, { "id": "custom.cellOptions", @@ -1528,12 +1592,12 @@ spec: "value": null }, { - "color": "orange", - "value": 10 + "color": "#EAB839", + "value": 60 }, { "color": "red", - "value": 50 + "value": 90 } ] } @@ -1543,6 +1607,30 @@ spec: "value": 1 } ] + }, + { + "matcher": { + "id": "byName", + "options": "Organization" + }, + "properties": [ + { + "id": "noValue", + "value": "DELETED" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Version" + }, + "properties": [ + { + "id": "noValue", + "value": "DELETED" + } + ] } ] }, @@ -1579,12 +1667,12 @@ spec: }, "editorMode": "code", "exemplar": false, - 
"expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", container=\"scanner\", job=~\"kubelet\"}) by (namespace) / sum(container_spec_memory_limit_bytes{namespace=~\"rhacs-$instance_id\", container=\"scanner\", job=~\"kubelet\"}) by (namespace)", + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"scanner\"}) by (namespace)", "format": "table", "instant": true, "legendFormat": "__auto", "range": false, - "refId": "Memory consumption" + "refId": "ScannerV2 memory" }, { "datasource": { @@ -1593,13 +1681,28 @@ spec: }, "editorMode": "code", "exemplar": false, - "expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"rhacs-$instance_id\", job=~\"kubelet\", pod=~\"scanner-.*\"}[5m])) by (namespace)", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"scanner\"}[$__range])) by (namespace)", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, - "refId": "Network received" + "refId": "ScannerV2 CPU" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"matcher\"}) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Matcher memory" }, { "datasource": { @@ -1608,13 +1711,13 @@ spec: }, "editorMode": "code", "exemplar": false, - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"rhacs-$instance_id\", job=~\"kubelet\", pod=~\"scanner-.*\"}[5m])) by (namespace)", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"matcher\"}[$__range])) by (namespace)", "format": "table", "hide": false, "instant": true, "legendFormat": 
"__auto", "range": false, - "refId": "Network Transmitted" + "refId": "Matcher CPU" }, { "datasource": { @@ -1623,13 +1726,43 @@ spec: }, "editorMode": "code", "exemplar": false, - "expr": "sum(process_cpu_seconds_total{namespace=~\"rhacs-$instance_id\", job=~\"scanner\"}) by (namespace, rhacs_org_name, rhacs_org_id)", + "expr": "sum(process_cpu_seconds_total{namespace=~\"rhacs-$instance_id\", job=~\"scanner\"}) by (namespace, rhacs_org_name, rhacs_version)", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, "refId": "Organization" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"indexer\"}) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Indexer memory" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"indexer\"}[$__range])) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Indexer CPU" } ], "title": "Scanner Overview Table", @@ -1638,7 +1771,7 @@ spec: "id": "seriesToColumns", "options": { "byField": "namespace", - "mode": "inner" + "mode": "outer" } }, { @@ -1657,33 +1790,47 @@ spec: "Value #Organization": true }, "indexByName": { - "Time": 6, - "Time 2": 7, - "Time 3": 8, - "Time 4": 9, - "Value #Memory consumption": 1, - "Value #Network Transmitted": 3, - "Value #Network received": 2, - "Value #Organization": 10, + "Time 1": 13, + "Time 2": 9, + "Time 3": 10, + "Time 4": 11, + "Time 5": 14, + "Time 6": 15, + "Time 7": 16, + "Value #Indexer CPU": 6, + "Value #Indexer 
memory": 5, + "Value #Matcher CPU": 4, + "Value #Matcher memory": 3, + "Value #Organization": 12, + "Value #ScannerV2 CPU": 2, + "Value #ScannerV2 memory": 1, "namespace": 0, - "rhacs_org_id": 5, - "rhacs_org_name": 4 + "rhacs_org_name": 7, + "rhacs_version": 8 }, "renameByName": { "Time": "", + "Time 7": "", "Value": "CPU consumption", "Value #A": "", "Value #CPU Throttle": "CPU throttle", "Value #CPU consumption": "CPU consumption", "Value #CPU throttle": "CPU throttle", + "Value #Indexer CPU": "Indexer CPU", + "Value #Indexer memory": "Indexer memory", + "Value #Matcher CPU": "Matcher CPU", + "Value #Matcher memory": "Matcher memory", "Value #Memory consumption": "Memory consumption", "Value #Network Transmitted": "Network transmitted", "Value #Network received": "Network received", "Value #Organisation": "", "Value #Organization": "", + "Value #ScannerV2 CPU": "ScannerV2 CPU", + "Value #ScannerV2 memory": "ScannerV2 memory", "namespace": "Namespace", "rhacs_org_id": "Organization ID", - "rhacs_org_name": "Organization" + "rhacs_org_name": "Organization", + "rhacs_version": "Version" } } } diff --git a/resources/grafana/sources/rhacs-cluster-overview.json b/resources/grafana/sources/rhacs-cluster-overview.json index a18a2ab6..d9179cbd 100644 --- a/resources/grafana/sources/rhacs-cluster-overview.json +++ b/resources/grafana/sources/rhacs-cluster-overview.json @@ -1375,7 +1375,39 @@ { "matcher": { "id": "byName", - "options": "Value #Memory consumption" + "options": "ScannerV2 memory" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ScannerV2 CPU" }, "properties": [ { @@ -1389,10 +1421,6 @@ "type": "gauge" } }, - { - "id": "max", - "value": 1 - }, { 
"id": "thresholds", "value": { @@ -1403,8 +1431,8 @@ "value": null }, { - "color": "orange", - "value": 70 + "color": "#EAB839", + "value": 60 }, { "color": "red", @@ -1412,47 +1440,55 @@ } ] } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Network received" - }, - "properties": [ + }, { - "id": "unit", - "value": "Bps" + "id": "decimals", + "value": 1 } ] }, { "matcher": { "id": "byName", - "options": "Network transmitted" + "options": "Matcher memory" }, "properties": [ { "id": "unit", - "value": "Bps" + "value": "decbytes" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" } ] }, { "matcher": { "id": "byName", - "options": "CPU consumption" + "options": "Matcher CPU" }, "properties": [ { "id": "unit", "value": "percentunit" }, - { - "id": "max", - "value": 1 - }, { "id": "custom.cellOptions", "value": { @@ -1470,35 +1506,63 @@ "value": null }, { - "color": "orange", + "color": "yellow", "value": 60 }, { "color": "red", - "value": 80 + "value": 90 } ] } }, { "id": "decimals", - "value": 2 + "value": 1 } ] }, { "matcher": { "id": "byName", - "options": "CPU throttle" + "options": "Indexer memory" }, "properties": [ { "id": "unit", - "value": "percentunit" + "value": "decbytes" }, { - "id": "max", - "value": 1 + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "noValue", + "value": "-" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Indexer CPU" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" }, { "id": "custom.cellOptions", @@ -1517,12 +1581,12 @@ "value": null }, { - "color": "orange", - "value": 10 + "color": "#EAB839", + "value": 60 }, { "color": "red", - "value": 50 + "value": 
90 } ] } @@ -1532,6 +1596,30 @@ "value": 1 } ] + }, + { + "matcher": { + "id": "byName", + "options": "Organization" + }, + "properties": [ + { + "id": "noValue", + "value": "DELETED" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Version" + }, + "properties": [ + { + "id": "noValue", + "value": "DELETED" + } + ] } ] }, @@ -1568,12 +1656,12 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", container=\"scanner\", job=~\"kubelet\"}) by (namespace) / sum(container_spec_memory_limit_bytes{namespace=~\"rhacs-$instance_id\", container=\"scanner\", job=~\"kubelet\"}) by (namespace)", + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"scanner\"}) by (namespace)", "format": "table", "instant": true, "legendFormat": "__auto", "range": false, - "refId": "Memory consumption" + "refId": "ScannerV2 memory" }, { "datasource": { @@ -1582,13 +1670,28 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"rhacs-$instance_id\", job=~\"kubelet\", pod=~\"scanner-.*\"}[5m])) by (namespace)", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"scanner\"}[$__range])) by (namespace)", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, - "refId": "Network received" + "refId": "ScannerV2 CPU" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"matcher\"}) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Matcher memory" }, { "datasource": { @@ -1597,13 +1700,13 @@ }, "editorMode": "code", 
"exemplar": false, - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"rhacs-$instance_id\", job=~\"kubelet\", pod=~\"scanner-.*\"}[5m])) by (namespace)", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"matcher\"}[$__range])) by (namespace)", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, - "refId": "Network Transmitted" + "refId": "Matcher CPU" }, { "datasource": { @@ -1612,13 +1715,43 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(process_cpu_seconds_total{namespace=~\"rhacs-$instance_id\", job=~\"scanner\"}) by (namespace, rhacs_org_name, rhacs_org_id)", + "expr": "sum(process_cpu_seconds_total{namespace=~\"rhacs-$instance_id\", job=~\"scanner\"}) by (namespace, rhacs_org_name, rhacs_version)", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, "refId": "Organization" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"indexer\"}) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Indexer memory" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"rhacs-$instance_id\", job=\"kubelet\", container=\"indexer\"}[$__range])) by (namespace)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Indexer CPU" } ], "title": "Scanner Overview Table", @@ -1627,7 +1760,7 @@ "id": "seriesToColumns", "options": { "byField": "namespace", - "mode": "inner" + "mode": "outer" } }, { @@ -1646,33 +1779,47 @@ "Value 
#Organization": true }, "indexByName": { - "Time": 6, - "Time 2": 7, - "Time 3": 8, - "Time 4": 9, - "Value #Memory consumption": 1, - "Value #Network Transmitted": 3, - "Value #Network received": 2, - "Value #Organization": 10, + "Time 1": 13, + "Time 2": 9, + "Time 3": 10, + "Time 4": 11, + "Time 5": 14, + "Time 6": 15, + "Time 7": 16, + "Value #Indexer CPU": 6, + "Value #Indexer memory": 5, + "Value #Matcher CPU": 4, + "Value #Matcher memory": 3, + "Value #Organization": 12, + "Value #ScannerV2 CPU": 2, + "Value #ScannerV2 memory": 1, "namespace": 0, - "rhacs_org_id": 5, - "rhacs_org_name": 4 + "rhacs_org_name": 7, + "rhacs_version": 8 }, "renameByName": { "Time": "", + "Time 7": "", "Value": "CPU consumption", "Value #A": "", "Value #CPU Throttle": "CPU throttle", "Value #CPU consumption": "CPU consumption", "Value #CPU throttle": "CPU throttle", + "Value #Indexer CPU": "Indexer CPU", + "Value #Indexer memory": "Indexer memory", + "Value #Matcher CPU": "Matcher CPU", + "Value #Matcher memory": "Matcher memory", "Value #Memory consumption": "Memory consumption", "Value #Network Transmitted": "Network transmitted", "Value #Network received": "Network received", "Value #Organisation": "", "Value #Organization": "", + "Value #ScannerV2 CPU": "ScannerV2 CPU", + "Value #ScannerV2 memory": "ScannerV2 memory", "namespace": "Namespace", "rhacs_org_id": "Organization ID", - "rhacs_org_name": "Organization" + "rhacs_org_name": "Organization", + "rhacs_version": "Version" } } } diff --git a/resources/prometheus/federation-config-base.yaml b/resources/prometheus/federation-config-base.yaml new file mode 100644 index 00000000..33a70c2f --- /dev/null +++ b/resources/prometheus/federation-config-base.yaml @@ -0,0 +1,5 @@ +# Use this list to add any required federated metrics that are not used by Rules, Alerts, or Grafana dashboards. +# Listed metrics will be merged with metrics used in Rules, Alerts, and Grafana dashboards. +match[]: + # This is an example. 
+ - up{job!~"central|scanner"} diff --git a/resources/prometheus/federation-config.yaml b/resources/prometheus/federation-config.yaml index 4bb785f1..d7a7a558 100644 --- a/resources/prometheus/federation-config.yaml +++ b/resources/prometheus/federation-config.yaml @@ -1,4 +1,169 @@ match[]: - # Federate all platform metrics except ACS metrics. We want all base metrics for the Kubernetes mixin. - - '{__name__=~".*", job!="central"}' - - '{__name__=~".*", job="central", endpoint!="monitoring-tls"}' + - :node_memory_MemAvailable_bytes:sum{job!~"central|scanner"} + - aggregator_unavailable_apiservice_total{job!~"central|scanner"} + - aggregator_unavailable_apiservice{job!~"central|scanner"} + - apiserver_request:availability30d{job!~"central|scanner"} + - apiserver_request:burnrate1d{job!~"central|scanner"} + - apiserver_request:burnrate1h{job!~"central|scanner"} + - apiserver_request:burnrate2h{job!~"central|scanner"} + - apiserver_request:burnrate30m{job!~"central|scanner"} + - apiserver_request:burnrate3d{job!~"central|scanner"} + - apiserver_request:burnrate5m{job!~"central|scanner"} + - apiserver_request:burnrate6h{job!~"central|scanner"} + - apiserver_request_terminations_total{job!~"central|scanner"} + - apiserver_request_total{job!~"central|scanner"} + - cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{job!~"central|scanner"} + - cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{job!~"central|scanner"} + - cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{job!~"central|scanner"} + - cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{job!~"central|scanner"} + - cluster:node_cpu:ratio_rate5m{job!~"central|scanner"} + - cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{job!~"central|scanner"} + - code_resource:apiserver_request_total:rate5m{job!~"central|scanner"} + - container_cpu_cfs_periods_total{job!~"central|scanner"} + - 
container_cpu_cfs_throttled_periods_total{job!~"central|scanner"} + - container_cpu_usage_seconds_total{job!~"central|scanner"} + - container_fs_reads_bytes_total{job!~"central|scanner"} + - container_fs_reads_total{job!~"central|scanner"} + - container_fs_writes_bytes_total{job!~"central|scanner"} + - container_fs_writes_total{job!~"central|scanner"} + - container_memory_cache{job!~"central|scanner"} + - container_memory_max_usage_bytes{job!~"central|scanner"} + - container_memory_rss{job!~"central|scanner"} + - container_memory_swap{job!~"central|scanner"} + - container_memory_working_set_bytes{job!~"central|scanner"} + - container_network_receive_bytes_total{job!~"central|scanner"} + - container_network_receive_packets_dropped_total{job!~"central|scanner"} + - container_network_receive_packets_total{job!~"central|scanner"} + - container_network_transmit_bytes_total{job!~"central|scanner"} + - container_network_transmit_packets_dropped_total{job!~"central|scanner"} + - container_network_transmit_packets_total{job!~"central|scanner"} + - container_spec_memory_limit_bytes{job!~"central|scanner"} + - go_gc_duration_seconds{job!~"central|scanner"} + - go_goroutines{job!~"central|scanner"} + - go_memstats_alloc_bytes{job!~"central|scanner"} + - go_memstats_heap_inuse_bytes{job!~"central|scanner"} + - go_memstats_stack_inuse_bytes{job!~"central|scanner"} + - grpc_server_handled_total{job!~"central|scanner"} + - grpc_server_handling_seconds_bucket{job!~"central|scanner"} + - grpc_server_started_total{job!~"central|scanner"} + - haproxy_backend_http_responses_total{job!~"central|scanner"} + - http_incoming_request_duration_histogram_seconds_bucket{job!~"central|scanner"} + - http_incoming_requests_total{job!~"central|scanner"} + - kube_daemonset_status_current_number_scheduled{job!~"central|scanner"} + - kube_daemonset_status_desired_number_scheduled{job!~"central|scanner"} + - kube_daemonset_status_number_available{job!~"central|scanner"} + - 
kube_daemonset_status_number_misscheduled{job!~"central|scanner"} + - kube_daemonset_status_updated_number_scheduled{job!~"central|scanner"} + - kube_deployment_metadata_generation{job!~"central|scanner"} + - kube_deployment_spec_replicas{job!~"central|scanner"} + - kube_deployment_status_condition{job!~"central|scanner"} + - kube_deployment_status_observed_generation{job!~"central|scanner"} + - kube_deployment_status_replicas_available{job!~"central|scanner"} + - kube_deployment_status_replicas_ready{job!~"central|scanner"} + - kube_deployment_status_replicas_updated{job!~"central|scanner"} + - kube_horizontalpodautoscaler_spec_max_replicas{job!~"central|scanner"} + - kube_horizontalpodautoscaler_spec_min_replicas{job!~"central|scanner"} + - kube_horizontalpodautoscaler_status_current_replicas{job!~"central|scanner"} + - kube_horizontalpodautoscaler_status_desired_replicas{job!~"central|scanner"} + - kube_job_failed{job!~"central|scanner"} + - kube_job_status_active{job!~"central|scanner"} + - kube_job_status_start_time{job!~"central|scanner"} + - kube_namespace_status_phase{job!~"central|scanner"} + - kube_node_info{job!~"central|scanner"} + - kube_node_labels{job!~"central|scanner"} + - kube_node_role{job!~"central|scanner"} + - kube_node_spec_taint{job!~"central|scanner"} + - kube_node_status_allocatable{job!~"central|scanner"} + - kube_node_status_capacity{job!~"central|scanner"} + - kube_node_status_condition{job!~"central|scanner"} + - kube_persistentvolume_status_phase{job!~"central|scanner"} + - kube_persistentvolumeclaim_access_mode{job!~"central|scanner"} + - kube_persistentvolumeclaim_labels{job!~"central|scanner"} + - kube_pod_container_resource_limits{job!~"central|scanner"} + - kube_pod_container_resource_requests{job!~"central|scanner"} + - kube_pod_container_status_ready{job!~"central|scanner"} + - kube_pod_container_status_restarts_total{job!~"central|scanner"} + - kube_pod_container_status_waiting_reason{job!~"central|scanner"} + - 
kube_pod_info{job!~"central|scanner"} + - kube_pod_labels{job!~"central|scanner"} + - kube_pod_owner{job!~"central|scanner"} + - kube_pod_status_phase{job!~"central|scanner"} + - kube_resourcequota{job!~"central|scanner"} + - kube_statefulset_metadata_generation{job!~"central|scanner"} + - kube_statefulset_replicas{job!~"central|scanner"} + - kube_statefulset_status_current_revision{job!~"central|scanner"} + - kube_statefulset_status_observed_generation{job!~"central|scanner"} + - kube_statefulset_status_replicas_ready{job!~"central|scanner"} + - kube_statefulset_status_replicas_updated{job!~"central|scanner"} + - kube_statefulset_status_replicas{job!~"central|scanner"} + - kube_statefulset_status_update_revision{job!~"central|scanner"} + - kubelet_certificate_manager_client_expiration_renew_errors{job!~"central|scanner"} + - kubelet_cgroup_manager_duration_seconds_bucket{job!~"central|scanner"} + - kubelet_cgroup_manager_duration_seconds_count{job!~"central|scanner"} + - kubelet_node_config_error{job!~"central|scanner"} + - kubelet_node_name{job!~"central|scanner"} + - kubelet_pleg_relist_duration_seconds_bucket{job!~"central|scanner"} + - kubelet_pleg_relist_duration_seconds_count{job!~"central|scanner"} + - kubelet_pleg_relist_interval_seconds_bucket{job!~"central|scanner"} + - kubelet_pod_start_duration_seconds_bucket{job!~"central|scanner"} + - kubelet_pod_start_duration_seconds_count{job!~"central|scanner"} + - kubelet_pod_worker_duration_seconds_bucket{job!~"central|scanner"} + - kubelet_pod_worker_duration_seconds_count{job!~"central|scanner"} + - kubelet_running_container_count{job!~"central|scanner"} + - kubelet_running_containers{job!~"central|scanner"} + - kubelet_running_pod_count{job!~"central|scanner"} + - kubelet_running_pods{job!~"central|scanner"} + - kubelet_runtime_operations_duration_seconds_bucket{job!~"central|scanner"} + - kubelet_runtime_operations_errors_total{job!~"central|scanner"} + - 
kubelet_runtime_operations_total{job!~"central|scanner"} + - kubelet_server_expiration_renew_errors{job!~"central|scanner"} + - kubelet_volume_stats_available_bytes{job!~"central|scanner"} + - kubelet_volume_stats_capacity_bytes{job!~"central|scanner"} + - kubelet_volume_stats_inodes_free{job!~"central|scanner"} + - kubelet_volume_stats_inodes_used{job!~"central|scanner"} + - kubelet_volume_stats_inodes{job!~"central|scanner"} + - kubelet_volume_stats_used_bytes{job!~"central|scanner"} + - kubeproxy_network_programming_duration_seconds_bucket{job!~"central|scanner"} + - kubeproxy_network_programming_duration_seconds_count{job!~"central|scanner"} + - kubeproxy_sync_proxy_rules_duration_seconds_bucket{job!~"central|scanner"} + - kubeproxy_sync_proxy_rules_duration_seconds_count{job!~"central|scanner"} + - kubernetes_build_info{job!~"central|scanner"} + - namespace_cpu:kube_pod_container_resource_limits:sum{job!~"central|scanner"} + - namespace_cpu:kube_pod_container_resource_requests:sum{job!~"central|scanner"} + - namespace_memory:kube_pod_container_resource_limits:sum{job!~"central|scanner"} + - namespace_memory:kube_pod_container_resource_requests:sum{job!~"central|scanner"} + - namespace_workload_pod:kube_pod_owner:relabel{job!~"central|scanner"} + - node_memory_MemTotal_bytes{job!~"central|scanner"} + - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{job!~"central|scanner"} + - node_namespace_pod_container:container_memory_cache{job!~"central|scanner"} + - node_namespace_pod_container:container_memory_rss{job!~"central|scanner"} + - node_namespace_pod_container:container_memory_swap{job!~"central|scanner"} + - node_namespace_pod_container:container_memory_working_set_bytes{job!~"central|scanner"} + - node_netstat_TcpExt_TCPSynRetrans{job!~"central|scanner"} + - node_netstat_Tcp_OutSegs{job!~"central|scanner"} + - node_netstat_Tcp_RetransSegs{job!~"central|scanner"} + - 
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{job!~"central|scanner"} + - obs_operator:prometheus_remote_storage_succeeded_samples:ratio_rate1h{job!~"central|scanner"} + - process_cpu_seconds_total{job!~"central|scanner"} + - process_resident_memory_bytes{job!~"central|scanner"} + - prometheus_remote_storage_samples_failed_total{job!~"central|scanner"} + - prometheus_remote_storage_samples_retried_total{job!~"central|scanner"} + - prometheus_remote_storage_samples_total{job!~"central|scanner"} + - rest_client_request_duration_seconds_bucket{job!~"central|scanner"} + - rest_client_requests_total{job!~"central|scanner"} + - scheduler_binding_duration_seconds_bucket{job!~"central|scanner"} + - scheduler_binding_duration_seconds_count{job!~"central|scanner"} + - scheduler_e2e_scheduling_duration_seconds_bucket{job!~"central|scanner"} + - scheduler_e2e_scheduling_duration_seconds_count{job!~"central|scanner"} + - scheduler_scheduling_algorithm_duration_seconds_bucket{job!~"central|scanner"} + - scheduler_scheduling_algorithm_duration_seconds_count{job!~"central|scanner"} + - scheduler_volume_scheduling_duration_seconds_bucket{job!~"central|scanner"} + - scheduler_volume_scheduling_duration_seconds_count{job!~"central|scanner"} + - storage_operation_duration_seconds_bucket{job!~"central|scanner"} + - storage_operation_duration_seconds_count{job!~"central|scanner"} + - storage_operation_errors_total{job!~"central|scanner"} + - up{job!~"central|scanner"} + - volume_manager_total_volumes{job!~"central|scanner"} + - workqueue_adds_total{job!~"central|scanner"} + - workqueue_depth{job!~"central|scanner"} + - workqueue_queue_duration_seconds_bucket{job!~"central|scanner"} diff --git a/scripts/generate-federate-match.sh b/scripts/generate-federate-match.sh new file mode 100755 index 00000000..f6c57feb --- /dev/null +++ b/scripts/generate-federate-match.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +set -eou pipefail +shopt -s inherit_errexit + +function 
log() { + echo "$@" >&2 +} + +function log_exit() { + log "$@" + + exit 1 +} + +function log_requirements_and_exit() { + log_exit "ERROR: One of the required commands is not available. Please ensure that the following commands are installed: jq, yq, mimirtool, realpath, sort, and uniq" +} + +! [ -x "$(command -v jq)" ] && log_requirements_and_exit +! [ -x "$(command -v mimirtool)" ] && log_requirements_and_exit +! [ -x "$(command -v realpath)" ] && log_requirements_and_exit +! [ -x "$(command -v sort)" ] && log_requirements_and_exit +! [ -x "$(command -v uniq)" ] && log_requirements_and_exit +! [ -x "$(command -v yq)" ] && log_requirements_and_exit + +function get_rules_metrics() { + local os_prom_rule_file="${1:-}" + [[ "${os_prom_rule_file}" = "" ]] && log_exit "Variable 'os_prom_rule_file' is empty." + + local metrics_list_file="${2:-}" + [[ "${metrics_list_file}" = "" ]] && log_exit "Variable 'metrics_list_file' is empty." + + local rules_file + rules_file=$(mktemp) + + local json_file + json_file=$(mktemp) + + log "exporting federated metrics for Prometheus rules file: '${os_prom_rule_file}'" + + yq '.spec' "${os_prom_rule_file}" > "${rules_file}" + mimirtool analyze rule-file "${rules_file}" --output="${json_file}" + jq '.ruleGroups[].metrics | select( . 
!= null ) | .[]' "${json_file}" --raw-output >> "${metrics_list_file}" + + rm -f "${rules_file}" + rm -f "${json_file}" +} + +function main() { + local script_dir + script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" &>/dev/null && pwd 2>/dev/null)" + + local repo_dir + repo_dir=$(realpath "${script_dir}/../") + + local working_tmp_dir + working_tmp_dir=$(mktemp -d) + log "Created temp dir for storing temp file: '${working_tmp_dir}'" + + local metrics_list_file="${working_tmp_dir}/metrics_list" + + # Get metrics used in Dashboards + log "exporting federated metrics for Prometheus dashboards in 'grafana/sources'" + mimirtool analyze dashboard "${repo_dir}"/resources/grafana/sources/* --output="${working_tmp_dir}/acs.json" + jq '.dashboards[].metrics[]' "${working_tmp_dir}/acs.json" --raw-output >> "${metrics_list_file}" + + log "exporting federated metrics for Prometheus dashboards in 'mixins/kubernetes/generated/dashboards'" + mimirtool analyze dashboard "${repo_dir}"/resources/mixins/kubernetes/generated/dashboards/* --output="${working_tmp_dir}/mixins.json" + jq '.dashboards[].metrics[]' "${working_tmp_dir}/mixins.json" --raw-output >> "${metrics_list_file}" + + # Get metrics used in recording rules and alerts + local rules_files + rules_files=$(jq '.config.prometheus.rules[]' "${repo_dir}/resources/index.json" --raw-output) + while IFS= read -r rules_file; do + get_rules_metrics "${repo_dir}/resources/${rules_file}" "${metrics_list_file}" + done <<< "${rules_files}" + + # Filter metrics (exclude metrics that are collected by observability Prometheus or created by recording rules) + sort "${metrics_list_file}" | uniq | grep -v -E "^acs|^rox|^aws|^central:|acscs_worker_nodes" | awk '{ print $1 "{job!~\"central|scanner\"}" }' > "${metrics_list_file}.filter" + + # Create federation-config.yaml + local yq_expression='. *+ load("'"${repo_dir}/resources/prometheus/federation-config-base.yaml"'")."match[]" | unique | sort | { "match[]": . 
}' + sed -e 's/^/- /' "${metrics_list_file}.filter" | yq "${yq_expression}" > "${repo_dir}/resources/prometheus/federation-config.yaml" + + # Clean up the temp directory with all transient files + rm -rf "${working_tmp_dir}" + log "Deleted temp dir: '${working_tmp_dir}'" +} + +main "$@"