component: prometheus: enable/disable http jobs like rules #316

Draft · wants to merge 1 commit into base: main
82 changes: 82 additions & 0 deletions components/component.go
@@ -97,6 +97,73 @@ func (c *Component) ConfigComponentLogger(logger logr.Logger, component string,
	return logger.WithName("DSC.Components." + component)
}

func getJobName(job map[any]any) (string, bool) {
I have not looked at the details of this PR, but shouldn't the component.go change get into ODH first?

Author

> I have not looked at the details of this PR, but shouldn't the component.go change get into ODH first?

100%. It's a branch to test it first.

Author

I was thinking that downstream they should come as one patch anyway (the upstream changes do not break anything there), but it looks like having the code without the prometheus changes, even if it does not work as expected, does not break the existing setup.

Yeah, this is a similar situation to MR's PR. We can use this one to test first and backport to ODH, as long as we have the logic applied to both upstream and downstream.

Author

I'll split it for the real submission, it's fine.

	nameAny, ok := job["job_name"]
	if !ok {
		fmt.Println("Could not fetch job_name")
		return "", false
	}
	name, ok := nameAny.(string)
	if !ok {
		fmt.Println("job_name is not a string")
		return "", false
	}

	return name, true
}
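The comma-ok checks above guard both the map lookup and the type assertion, since yaml.v2 unmarshals untyped YAML mappings as map[interface{}]interface{}. A standalone sketch of the same helper (no YAML dependency; the sample job map is hypothetical):

```go
package main

import "fmt"

// getJobName mirrors the helper above: both the "job_name" lookup and the
// string assertion can fail, and each failure is signalled via the bool.
func getJobName(job map[any]any) (string, bool) {
	nameAny, ok := job["job_name"]
	if !ok {
		return "", false
	}
	name, ok := nameAny.(string)
	return name, ok
}

func main() {
	job := map[any]any{"job_name": "codeflare-job", "scrape_interval": "10s"}
	name, ok := getJobName(job)
	fmt.Println(name, ok) // codeflare-job true

	_, ok = getJobName(map[any]any{"job_name": 42})
	fmt.Println(ok) // false
}
```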

func getJobIdx(scrapeConfigs *[]any, jobName string) (int, bool) {
I am thinking about this function: is the "bool" really needed? If it is there to get the index so that the removal in updateJob() works, then it either returns 0 (not found in the job map) or a non-zero index; e.g. "scrape_configs element is not array" and the final return are treated the same.

So

	idx, exists := getJobIdx(&scrapeConfigs, name)
	switch {
	case enable && !exists:
		scrapeConfigs = append(scrapeConfigs, job)
	case !enable && exists:
		scrapeConfigs = append(scrapeConfigs[:idx], scrapeConfigs[idx+1:]...)
	default:
		return
	}

can be

	idx := getJobIdx(&scrapeConfigs, name)
	if idx != 0 {
		if enable {
			(*prometheusContent)["scrape_configs"] = append(scrapeConfigs, job)
		} else {
			(*prometheusContent)["scrape_configs"] = append(scrapeConfigs[:idx], scrapeConfigs[idx+1:]...)
		}
	}

right?

Author
@ykaliuta ykaliuta Aug 1, 2024

Let's split it into two parts.

  1. A special idx value for "not found" vs. a flag.
     0 is a valid index, but it is possible to use a negative value, for example. I find that very much C-style (I use it in C all the time), whereas in Go one idiomatically uses a flag and/or an error value.

  2. Code refactoring.
     2.1) It should not append if the job already exists, so it must still be a separate branch. At least for rules, that check makes enabling work when the job is already enabled.
     2.2) Although it can be written as

	if enable && idx < 0 {
		(*prometheusContent)["scrape_configs"] = append(scrapeConfigs, job)
	} else if !enable && idx >= 0 {
		(*prometheusContent)["scrape_configs"] = append(scrapeConfigs[:idx], scrapeConfigs[idx+1:]...)
	}

     it's a matter of taste. I find preparing the list and then updating it in one place nicer.

	for i, j := range *scrapeConfigs {
		job, ok := j.(map[any]any)
		if !ok {
			fmt.Println("scrape_configs element is not array")
			return 0, false
		}
		name, ok := getJobName(job)
		if ok && name == jobName {
			return i, true
		}
	}
	return 0, false
}

func updateJob(prometheusContent *map[any]any, jobStr string, enable bool) {
	var job map[any]any

	if err := yaml.Unmarshal([]byte(jobStr), &job); err != nil {
		fmt.Printf("Error unmarshaling job: %v\n", err)
		return
	}

	scrapeConfigsAny, ok := (*prometheusContent)["scrape_configs"]
	if !ok {
		fmt.Println("Could not fetch scrape_configs")
		return
	}

	scrapeConfigs, ok := scrapeConfigsAny.([]any)
	if !ok {
		fmt.Println("scrape_configs is not an array")
		return
	}

	name, ok := getJobName(job)
	if !ok {
		return
	}

	idx, exists := getJobIdx(&scrapeConfigs, name)
	switch {
	case enable && !exists:
		scrapeConfigs = append(scrapeConfigs, job)
	case !enable && exists:
		scrapeConfigs = append(scrapeConfigs[:idx], scrapeConfigs[idx+1:]...)
	default:
		return
	}
	(*prometheusContent)["scrape_configs"] = scrapeConfigs
}
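Taken together, the two helpers implement an idempotent add/remove on the scrape_configs list. A self-contained sketch of that flow (the sample config and job maps are hypothetical literals, and the function names here are mine; the real code parses the maps from the ConfigMap YAML instead):

```go
package main

import "fmt"

// findJob returns the index of the scrape_configs entry whose job_name
// matches, in the PR's flag-return style (0 is a valid index, so a
// separate bool signals "not found").
func findJob(scrapeConfigs []any, jobName string) (int, bool) {
	for i, j := range scrapeConfigs {
		job, ok := j.(map[any]any)
		if !ok {
			continue
		}
		if name, ok := job["job_name"].(string); ok && name == jobName {
			return i, true
		}
	}
	return 0, false
}

// setJob adds or removes a job, touching the slice only when the desired
// state differs from the current one (the switch mirrors updateJob above).
func setJob(content map[any]any, job map[any]any, enable bool) {
	scrapeConfigs, _ := content["scrape_configs"].([]any)
	name, _ := job["job_name"].(string)
	idx, exists := findJob(scrapeConfigs, name)
	switch {
	case enable && !exists:
		scrapeConfigs = append(scrapeConfigs, job)
	case !enable && exists:
		scrapeConfigs = append(scrapeConfigs[:idx], scrapeConfigs[idx+1:]...)
	default:
		return // already in the desired state
	}
	content["scrape_configs"] = scrapeConfigs
}

func main() {
	content := map[any]any{"scrape_configs": []any{
		map[any]any{"job_name": "existing"},
	}}
	job := map[any]any{"job_name": "codeflare"}

	setJob(content, job, true)
	fmt.Println(len(content["scrape_configs"].([]any))) // 2

	setJob(content, job, false)
	fmt.Println(len(content["scrape_configs"].([]any))) // 1
}
```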

// UpdatePrometheusConfig update prometheus-configs.yaml to include/exclude <component>.rules
// parameter enable when set to true to add new rules, when set to false to remove existing rules.
func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, component string) error {
@@ -116,19 +183,23 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, compone
	DeadManSnitchRules string `yaml:"deadmanssnitch-alerting.rules"`
	DashboardRRules string `yaml:"rhods-dashboard-recording.rules"`
	DashboardARules string `yaml:"rhods-dashboard-alerting.rules"`
	DashboardJob string `yaml:"rhods-dashboard-job"`
	DSPRRules string `yaml:"data-science-pipelines-operator-recording.rules"`
	DSPARules string `yaml:"data-science-pipelines-operator-alerting.rules"`
	DSPJob string `yaml:"data-science-pipelines-operator-job"`
	MMRRules string `yaml:"model-mesh-recording.rules"`
	MMARules string `yaml:"model-mesh-alerting.rules"`
	OdhModelRRules string `yaml:"odh-model-controller-recording.rules"`
	OdhModelARules string `yaml:"odh-model-controller-alerting.rules"`
	CFORRules string `yaml:"codeflare-recording.rules"`
	CFOARules string `yaml:"codeflare-alerting.rules"`
	CFOJob string `yaml:"codeflare-job"`
	RayARules string `yaml:"ray-alerting.rules"`
	KueueARules string `yaml:"kueue-alerting.rules"`
	TrainingOperatorARules string `yaml:"trainingoperator-alerting.rules"`
	WorkbenchesRRules string `yaml:"workbenches-recording.rules"`
	WorkbenchesARules string `yaml:"workbenches-alerting.rules"`
	WorkbenchesJob string `yaml:"workbenches-job"`
	TrustyAIRRules string `yaml:"trustyai-recording.rules"`
	TrustyAIARules string `yaml:"trustyai-alerting.rules"`
	KserveRRules string `yaml:"kserve-recording.rules"`
@@ -179,6 +250,17 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, compone
}
}

	job, ok := map[string]string{
		"codeflare":                       configMap.Data.CFOJob,
		"data-science-pipelines-operator": configMap.Data.DSPJob,
		"rhods-dashboard":                 configMap.Data.DashboardJob,
		"workbenches":                     configMap.Data.WorkbenchesJob,
	}[component]

	if ok {
		updateJob(&prometheusContent, job, enable)
	}
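Indexing a map literal with the comma-ok form is what makes the lookup above safe: components without an http job yield ok == false and are skipped. A minimal sketch of the idiom (the component names, and kserve as a missing key, are illustrative only):

```go
package main

import "fmt"

func main() {
	// Comma-ok on a map index: missing keys return the zero value ("")
	// together with ok == false, so no update is attempted for them.
	jobs := map[string]string{
		"codeflare":       "codeflare-job",
		"rhods-dashboard": "rhods-dashboard-job",
	}

	for _, component := range []string{"codeflare", "kserve"} {
		if job, ok := jobs[component]; ok {
			fmt.Printf("update %s -> %s\n", component, job)
		} else {
			fmt.Printf("skip %s\n", component)
		}
	}
	// Output:
	// update codeflare -> codeflare-job
	// skip kserve
}
```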

// Marshal back
newDataYAML, err := yaml.Marshal(&prometheusContent)
if err != nil {
179 changes: 91 additions & 88 deletions config/monitoring/prometheus/apps/prometheus-configs.yaml
@@ -41,94 +41,6 @@ data:
- targets:
- "prometheus-k8s.openshift-monitoring.svc.cluster.local:9091"

- job_name: 'user_facing_endpoints_status_workbenches'
scrape_interval: 10s
metrics_path: /probe
scheme: https
tls_config:
insecure_skip_verify: true
params:
module: [http_2xx]
authorization:
credentials_file: /run/secrets/kubernetes.io/serviceaccount/token
static_configs:
- targets: [notebook-controller-service.<odh_application_namespace>.svc:8080/metrics,odh-notebook-controller-service.<odh_application_namespace>.svc:8080/metrics]
labels:
name: notebook-spawner
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.<odh_monitoring_project>.svc.cluster.local:9114

- job_name: 'user_facing_endpoints_status_rhods_dashboard'
scrape_interval: 10s
metrics_path: /probe
scheme: https
tls_config:
insecure_skip_verify: true
params:
module: [http_2xx]
authorization:
credentials_file: /run/secrets/kubernetes.io/serviceaccount/token
static_configs:
- targets: [rhods-dashboard-<odh_application_namespace>.<console_domain>]
labels:
name: rhods-dashboard
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.<odh_monitoring_project>.svc.cluster.local:9114

- job_name: 'user_facing_endpoints_status_codeflare'
scrape_interval: 10s
metrics_path: /probe
scheme: https
tls_config:
insecure_skip_verify: true
params:
module: [http_2xx]
authorization:
credentials_file: /run/secrets/kubernetes.io/serviceaccount/token
static_configs:
- targets: [codeflare-operator-manager-metrics.<odh_application_namespace>.svc:8080/metrics]
labels:
name: codeflare-operator
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.<odh_monitoring_project>.svc.cluster.local:9114

- job_name: 'user_facing_endpoints_status_dsp'
scrape_interval: 10s
metrics_path: /probe
scheme: https
tls_config:
insecure_skip_verify: true
params:
module: [http_2xx]
authorization:
credentials_file: /run/secrets/kubernetes.io/serviceaccount/token
static_configs:
- targets: [data-science-pipelines-operator-service.<odh_application_namespace>.svc:8080/metrics]
labels:
name: data-science-pipelines-operator
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.<odh_monitoring_project>.svc.cluster.local:9114

- job_name: 'Kubeflow Notebook Controller Service Metrics'
honor_labels: true
metrics_path: /metrics
@@ -571,6 +483,29 @@ data:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-absent-over-time.md'
summary: Alerting for CodeFlare Operator

codeflare-job: |
job_name: 'user_facing_endpoints_status_codeflare'
scrape_interval: 10s
metrics_path: /probe
scheme: https
tls_config:
insecure_skip_verify: true
params:
module: [http_2xx]
authorization:
credentials_file: /run/secrets/kubernetes.io/serviceaccount/token
static_configs:
- targets: [codeflare-operator-manager-metrics.<odh_application_namespace>.svc:8080/metrics]
labels:
name: codeflare-operator
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.<odh_monitoring_project>.svc.cluster.local:9114

trainingoperator-alerting.rules: |
groups:
- name: KubeFlow Training Operator
@@ -784,6 +719,28 @@ data:
labels:
severity: warning
namespace: redhat-ods-applications
rhods-dashboard-job: |
job_name: 'user_facing_endpoints_status_rhods_dashboard'
scrape_interval: 10s
metrics_path: /probe
scheme: https
tls_config:
insecure_skip_verify: true
params:
module: [http_2xx]
authorization:
credentials_file: /run/secrets/kubernetes.io/serviceaccount/token
static_configs:
- targets: [rhods-dashboard-<odh_application_namespace>.<console_domain>]
labels:
name: rhods-dashboard
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.<odh_monitoring_project>.svc.cluster.local:9114

data-science-pipelines-operator-recording.rules: |
groups:
@@ -1017,6 +974,29 @@ data:
severity: info
namespace: redhat-ods-applications

data-science-pipelines-operator-job: |
job_name: 'user_facing_endpoints_status_dsp'
scrape_interval: 10s
metrics_path: /probe
scheme: https
tls_config:
insecure_skip_verify: true
params:
module: [http_2xx]
authorization:
credentials_file: /run/secrets/kubernetes.io/serviceaccount/token
static_configs:
- targets: [data-science-pipelines-operator-service.<odh_application_namespace>.svc:8080/metrics]
labels:
name: data-science-pipelines-operator
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.<odh_monitoring_project>.svc.cluster.local:9114

model-mesh-recording.rules: |
groups:
- name: SLOs - Modelmesh Controller
@@ -1490,6 +1470,29 @@ data:
severity: warning
instance: notebook-spawner

workbenches-job: |
job_name: 'user_facing_endpoints_status_workbenches'
scrape_interval: 10s
metrics_path: /probe
scheme: https
tls_config:
insecure_skip_verify: true
params:
module: [http_2xx]
authorization:
credentials_file: /run/secrets/kubernetes.io/serviceaccount/token
static_configs:
- targets: [notebook-controller-service.<odh_application_namespace>.svc:8080/metrics,odh-notebook-controller-service.<odh_application_namespace>.svc:8080/metrics]
labels:
name: notebook-spawner
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter.<odh_monitoring_project>.svc.cluster.local:9114

trustyai-recording.rules: |
groups:
- name: SLOs - TrustyAI Controller Manager