diff --git a/components/component.go b/components/component.go index 39eb85a08d9..b2379f6ea5c 100644 --- a/components/component.go +++ b/components/component.go @@ -97,6 +97,73 @@ func (c *Component) ConfigComponentLogger(logger logr.Logger, component string, return logger.WithName("DSC.Components." + component) } +func getJobName(job map[any]any) (string, bool) { + nameAny, ok := job["job_name"] + if !ok { + fmt.Println("Could not fetch job_name") + return "", false + } + name, ok := nameAny.(string) + if !ok { + fmt.Println("job_name is not a string") + return "", false + } + + return name, true +} + +func getJobIdx(scrapeConfigs *[]any, jobName string) (int, bool) { + for i, j := range *scrapeConfigs { + job, ok := j.(map[any]any) + if !ok { + fmt.Println("scrape_configs element is not array") + return 0, false + } + name, ok := getJobName(job) + if ok && name == jobName { + return i, true + } + } + return 0, false +} + +func updateJob(prometheusContent *map[any]any, jobStr string, enable bool) { + var job map[any]any + + if err := yaml.Unmarshal([]byte(jobStr), &job); err != nil { + fmt.Printf("Error Unmarshaling job: %v\n", err) + return + } + + scrapeConfigsAny, ok := (*prometheusContent)["scrape_configs"] + if !ok { + fmt.Println("Could not fetch scrape_configs") + return + } + + scrapeConfigs, ok := scrapeConfigsAny.([]any) + if !ok { + fmt.Println("scrape_configs is not an array") + return + } + + name, ok := getJobName(job) + if !ok { + return + } + + idx, exists := getJobIdx(&scrapeConfigs, name) + switch { + case enable && !exists: + scrapeConfigs = append(scrapeConfigs, job) + case !enable && exists: + scrapeConfigs = append(scrapeConfigs[:idx], scrapeConfigs[idx+1:]...) + default: + return + } + (*prometheusContent)["scrape_configs"] = scrapeConfigs +} + // UpdatePrometheusConfig update prometheus-configs.yaml to include/exclude .rules // parameter enable when set to true to add new rules, when set to false to remove existing rules. func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, component string) error { @@ -116,19 +183,23 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, compone DeadManSnitchRules string `yaml:"deadmanssnitch-alerting.rules"` DashboardRRules string `yaml:"rhods-dashboard-recording.rules"` DashboardARules string `yaml:"rhods-dashboard-alerting.rules"` + DashboardJob string `yaml:"rhods-dashboard-job"` DSPRRules string `yaml:"data-science-pipelines-operator-recording.rules"` DSPARules string `yaml:"data-science-pipelines-operator-alerting.rules"` + DSPJob string `yaml:"data-science-pipelines-operator-job"` MMRRules string `yaml:"model-mesh-recording.rules"` MMARules string `yaml:"model-mesh-alerting.rules"` OdhModelRRules string `yaml:"odh-model-controller-recording.rules"` OdhModelARules string `yaml:"odh-model-controller-alerting.rules"` CFORRules string `yaml:"codeflare-recording.rules"` CFOARules string `yaml:"codeflare-alerting.rules"` + CFOJob string `yaml:"codeflare-job"` RayARules string `yaml:"ray-alerting.rules"` KueueARules string `yaml:"kueue-alerting.rules"` TrainingOperatorARules string `yaml:"trainingoperator-alerting.rules"` WorkbenchesRRules string `yaml:"workbenches-recording.rules"` WorkbenchesARules string `yaml:"workbenches-alerting.rules"` + WorkbenchesJob string `yaml:"workbenches-job"` TrustyAIRRules string `yaml:"trustyai-recording.rules"` TrustyAIARules string `yaml:"trustyai-alerting.rules"` KserveRRules string `yaml:"kserve-recording.rules"` @@ -179,6 +250,17 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, compone } } + job, ok := map[string]string{ + "codeflare": configMap.Data.CFOJob, + "data-science-pipelines-operator": configMap.Data.DSPJob, + "rhods-dashboard": configMap.Data.DashboardJob, + "workbenches": configMap.Data.WorkbenchesJob, + }[component] + + if ok { + updateJob(&prometheusContent, job, enable) + } + // Marshal back newDataYAML, err := yaml.Marshal(&prometheusContent) if err != nil { diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index c1b25b3bfbb..3ca37d53f72 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -41,94 +41,6 @@ data: - targets: - "prometheus-k8s.openshift-monitoring.svc.cluster.local:9091" - - job_name: 'user_facing_endpoints_status_workbenches' - scrape_interval: 10s - metrics_path: /probe - scheme: https - tls_config: - insecure_skip_verify: true - params: - module: [http_2xx] - authorization: - credentials_file: /run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: [notebook-controller-service..svc:8080/metrics,odh-notebook-controller-service..svc:8080/metrics] - labels: - name: notebook-spawner - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter..svc.cluster.local:9114 - - - job_name: 'user_facing_endpoints_status_rhods_dashboard' - scrape_interval: 10s - metrics_path: /probe - scheme: https - tls_config: - insecure_skip_verify: true - params: - module: [http_2xx] - authorization: - credentials_file: /run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: [rhods-dashboard-.] - labels: - name: rhods-dashboard - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter..svc.cluster.local:9114 - - - job_name: 'user_facing_endpoints_status_codeflare' - scrape_interval: 10s - metrics_path: /probe - scheme: https - tls_config: - insecure_skip_verify: true - params: - module: [http_2xx] - authorization: - credentials_file: /run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: [codeflare-operator-manager-metrics..svc:8080/metrics] - labels: - name: codeflare-operator - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter..svc.cluster.local:9114 - - - job_name: 'user_facing_endpoints_status_dsp' - scrape_interval: 10s - metrics_path: /probe - scheme: https - tls_config: - insecure_skip_verify: true - params: - module: [http_2xx] - authorization: - credentials_file: /run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: [data-science-pipelines-operator-service..svc:8080/metrics] - labels: - name: data-science-pipelines-operator - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter..svc.cluster.local:9114 - - job_name: 'Kubeflow Notebook Controller Service Metrics' honor_labels: true metrics_path: /metrics @@ -571,6 +483,29 @@ data: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-absent-over-time.md' summary: Alerting for CodeFlare Operator + codeflare-job: | + job_name: 'user_facing_endpoints_status_codeflare' + scrape_interval: 10s + metrics_path: /probe + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + static_configs: + - targets: [codeflare-operator-manager-metrics..svc:8080/metrics] + labels: + name: codeflare-operator + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter..svc.cluster.local:9114 + trainingoperator-alerting.rules: | groups: - name: KubeFlow Training Operator @@ -784,6 +719,28 @@ data: labels: severity: warning namespace: redhat-ods-applications + rhods-dashboard-job: | + job_name: 'user_facing_endpoints_status_rhods_dashboard' + scrape_interval: 10s + metrics_path: /probe + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + static_configs: + - targets: [rhods-dashboard-.] + labels: + name: rhods-dashboard + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter..svc.cluster.local:9114 data-science-pipelines-operator-recording.rules: | groups: @@ -1017,6 +974,29 @@ data: severity: info namespace: redhat-ods-applications + data-science-pipelines-operator-job: | + job_name: 'user_facing_endpoints_status_dsp' + scrape_interval: 10s + metrics_path: /probe + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + static_configs: + - targets: [data-science-pipelines-operator-service..svc:8080/metrics] + labels: + name: data-science-pipelines-operator + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter..svc.cluster.local:9114 + model-mesh-recording.rules: | groups: - name: SLOs - Modelmesh Controller @@ -1490,6 +1470,29 @@ data: severity: warning instance: notebook-spawner + workbenches-job: | + job_name: 'user_facing_endpoints_status_workbenches' + scrape_interval: 10s + metrics_path: /probe + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + static_configs: + - targets: [notebook-controller-service..svc:8080/metrics,odh-notebook-controller-service..svc:8080/metrics] + labels: + name: notebook-spawner + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter..svc.cluster.local:9114 + trustyai-recording.rules: | groups: - name: SLOs - TrustyAI Controller Manager