From aa23e6fb5b274ef7d7dad2dfde8251d7fe6c38be Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Wed, 31 Jul 2024 13:42:28 +0300 Subject: [PATCH] component: prometheus: enable/disable http jobs like rules Jira: https://issues.redhat.com/browse/RHOAIENG-87 Some jobs scrape the http endpoint /probe (versus those which use kubernetes_sd_configs, for example) and they start collecting metrics as soon as they are configured. While an enabled component's rules are deployed when its deployment is available, that does not help here, since these jobs fetch stale metrics from the endpoints anyway. Configure them in the prometheus.yml file (a field in the mounted ConfigMap) in a similar way to how `rules_files` are configured: put them into separate data fields and substitute the array of `scrape_configs` in the unmarshaled prometheusContent map. Signed-off-by: Yauheni Kaliuta --- components/component.go | 82 ++++++++ .../prometheus/apps/prometheus-configs.yaml | 179 +++++++++--------- 2 files changed, 173 insertions(+), 88 deletions(-) diff --git a/components/component.go b/components/component.go index 39eb85a08d9..b2379f6ea5c 100644 --- a/components/component.go +++ b/components/component.go @@ -97,6 +97,73 @@ func (c *Component) ConfigComponentLogger(logger logr.Logger, component string, return logger.WithName("DSC.Components." 
+ component) } +func getJobName(job map[any]any) (string, bool) { + nameAny, ok := job["job_name"] + if !ok { + fmt.Println("Could not fetch job_name") + return "", false + } + name, ok := nameAny.(string) + if !ok { + fmt.Println("job_name is not a string") + return "", false + } + + return name, true +} + +func getJobIdx(scrapeConfigs *[]any, jobName string) (int, bool) { + for i, j := range *scrapeConfigs { + job, ok := j.(map[any]any) + if !ok { + fmt.Println("scrape_configs element is not array") + return 0, false + } + name, ok := getJobName(job) + if ok && name == jobName { + return i, true + } + } + return 0, false +} + +func updateJob(prometheusContent *map[any]any, jobStr string, enable bool) { + var job map[any]any + + if err := yaml.Unmarshal([]byte(jobStr), &job); err != nil { + fmt.Printf("Error Unmarshaling job: %v\n", err) + return + } + + scrapeConfigsAny, ok := (*prometheusContent)["scrape_configs"] + if !ok { + fmt.Println("Could not fetch scrape_configs") + return + } + + scrapeConfigs, ok := scrapeConfigsAny.([]any) + if !ok { + fmt.Println("scrape_configs is not an array") + return + } + + name, ok := getJobName(job) + if !ok { + return + } + + idx, exists := getJobIdx(&scrapeConfigs, name) + switch { + case enable && !exists: + scrapeConfigs = append(scrapeConfigs, job) + case !enable && exists: + scrapeConfigs = append(scrapeConfigs[:idx], scrapeConfigs[idx+1:]...) + default: + return + } + (*prometheusContent)["scrape_configs"] = scrapeConfigs +} + // UpdatePrometheusConfig update prometheus-configs.yaml to include/exclude .rules // parameter enable when set to true to add new rules, when set to false to remove existing rules. 
func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, component string) error { @@ -116,19 +183,23 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, compone DeadManSnitchRules string `yaml:"deadmanssnitch-alerting.rules"` DashboardRRules string `yaml:"rhods-dashboard-recording.rules"` DashboardARules string `yaml:"rhods-dashboard-alerting.rules"` + DashboardJob string `yaml:"rhods-dashboard-job"` DSPRRules string `yaml:"data-science-pipelines-operator-recording.rules"` DSPARules string `yaml:"data-science-pipelines-operator-alerting.rules"` + DSPJob string `yaml:"data-science-pipelines-operator-job"` MMRRules string `yaml:"model-mesh-recording.rules"` MMARules string `yaml:"model-mesh-alerting.rules"` OdhModelRRules string `yaml:"odh-model-controller-recording.rules"` OdhModelARules string `yaml:"odh-model-controller-alerting.rules"` CFORRules string `yaml:"codeflare-recording.rules"` CFOARules string `yaml:"codeflare-alerting.rules"` + CFOJob string `yaml:"codeflare-job"` RayARules string `yaml:"ray-alerting.rules"` KueueARules string `yaml:"kueue-alerting.rules"` TrainingOperatorARules string `yaml:"trainingoperator-alerting.rules"` WorkbenchesRRules string `yaml:"workbenches-recording.rules"` WorkbenchesARules string `yaml:"workbenches-alerting.rules"` + WorkbenchesJob string `yaml:"workbenches-job"` TrustyAIRRules string `yaml:"trustyai-recording.rules"` TrustyAIARules string `yaml:"trustyai-alerting.rules"` KserveRRules string `yaml:"kserve-recording.rules"` @@ -179,6 +250,17 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, compone } } + job, ok := map[string]string{ + "codeflare": configMap.Data.CFOJob, + "data-science-pipelines-operator": configMap.Data.DSPJob, + "rhods-dashboard": configMap.Data.DashboardJob, + "workbenches": configMap.Data.WorkbenchesJob, + }[component] + + if ok { + updateJob(&prometheusContent, job, enable) + } + // Marshal back newDataYAML, err := 
yaml.Marshal(&prometheusContent) if err != nil { diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index c1b25b3bfbb..3ca37d53f72 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -41,94 +41,6 @@ data: - targets: - "prometheus-k8s.openshift-monitoring.svc.cluster.local:9091" - - job_name: 'user_facing_endpoints_status_workbenches' - scrape_interval: 10s - metrics_path: /probe - scheme: https - tls_config: - insecure_skip_verify: true - params: - module: [http_2xx] - authorization: - credentials_file: /run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: [notebook-controller-service..svc:8080/metrics,odh-notebook-controller-service..svc:8080/metrics] - labels: - name: notebook-spawner - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter..svc.cluster.local:9114 - - - job_name: 'user_facing_endpoints_status_rhods_dashboard' - scrape_interval: 10s - metrics_path: /probe - scheme: https - tls_config: - insecure_skip_verify: true - params: - module: [http_2xx] - authorization: - credentials_file: /run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: [rhods-dashboard-.] 
- labels: - name: rhods-dashboard - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter..svc.cluster.local:9114 - - - job_name: 'user_facing_endpoints_status_codeflare' - scrape_interval: 10s - metrics_path: /probe - scheme: https - tls_config: - insecure_skip_verify: true - params: - module: [http_2xx] - authorization: - credentials_file: /run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: [codeflare-operator-manager-metrics..svc:8080/metrics] - labels: - name: codeflare-operator - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter..svc.cluster.local:9114 - - - job_name: 'user_facing_endpoints_status_dsp' - scrape_interval: 10s - metrics_path: /probe - scheme: https - tls_config: - insecure_skip_verify: true - params: - module: [http_2xx] - authorization: - credentials_file: /run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: [data-science-pipelines-operator-service..svc:8080/metrics] - labels: - name: data-science-pipelines-operator - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter..svc.cluster.local:9114 - - job_name: 'Kubeflow Notebook Controller Service Metrics' honor_labels: true metrics_path: /metrics @@ -571,6 +483,29 @@ data: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-absent-over-time.md' summary: Alerting for CodeFlare Operator + codeflare-job: | + job_name: 'user_facing_endpoints_status_codeflare' + scrape_interval: 10s + metrics_path: /probe + scheme: https + 
tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + static_configs: + - targets: [codeflare-operator-manager-metrics..svc:8080/metrics] + labels: + name: codeflare-operator + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter..svc.cluster.local:9114 + trainingoperator-alerting.rules: | groups: - name: KubeFlow Training Operator @@ -784,6 +719,28 @@ data: labels: severity: warning namespace: redhat-ods-applications + rhods-dashboard-job: | + job_name: 'user_facing_endpoints_status_rhods_dashboard' + scrape_interval: 10s + metrics_path: /probe + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + static_configs: + - targets: [rhods-dashboard-.] 
+ labels: + name: rhods-dashboard + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter..svc.cluster.local:9114 data-science-pipelines-operator-recording.rules: | groups: @@ -1017,6 +974,29 @@ data: severity: info namespace: redhat-ods-applications + data-science-pipelines-operator-job: | + job_name: 'user_facing_endpoints_status_dsp' + scrape_interval: 10s + metrics_path: /probe + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + static_configs: + - targets: [data-science-pipelines-operator-service..svc:8080/metrics] + labels: + name: data-science-pipelines-operator + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter..svc.cluster.local:9114 + model-mesh-recording.rules: | groups: - name: SLOs - Modelmesh Controller @@ -1490,6 +1470,29 @@ data: severity: warning instance: notebook-spawner + workbenches-job: | + job_name: 'user_facing_endpoints_status_workbenches' + scrape_interval: 10s + metrics_path: /probe + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + static_configs: + - targets: [notebook-controller-service..svc:8080/metrics,odh-notebook-controller-service..svc:8080/metrics] + labels: + name: notebook-spawner + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter..svc.cluster.local:9114 + trustyai-recording.rules: | groups: - name: SLOs - TrustyAI Controller 
Manager