diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b8c2248..e6b0fdf 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,9 +1,12 @@ name: ci on: + push: + branches: + - master pull_request: types: - - closed + - ready_for_review jobs: diff --git a/README.md b/README.md index 5d85026..a5548a1 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ issue [Here](https://github.com/kubernetes/kubernetes/issues/69507) ```bash helm repo add k8s-ephemeral-storage-metrics https://jmcgrath207.github.io/k8s-ephemeral-storage-metrics/chart +helm repo update helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral-storage-metrics ``` @@ -22,7 +23,7 @@ helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral | Key | Type | Default | Description | |-----|------|---------|-------------| -| deploy_type | string | `"DaemonSet"` | | +| deploy_type | string | `"Deployment"` | Set to Deployment for a single controller that queries all nodes, or to DaemonSet | | dev.enabled | bool | `false` | | | extra.adjusted_polling_rate | bool | `false` | Create the ephemeral_storage_adjusted_polling_rate metrics to report Adjusted Poll Rate in milliseconds. Typically used for testing. | | image.imagePullPolicy | string | `"IfNotPresent"` | | diff --git a/chart/Chart.yaml b/chart/Chart.yaml index f831596..57a9a7d 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: k8s-ephemeral-storage-metrics -version: 1.0.2 -appVersion: 1.0.2 +version: 1.1.0 +appVersion: 1.1.0 kubeVersion: ">=1.21.0-0" description: Ephemeral storage metrics for prometheus operator.
home: https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics diff --git a/chart/README.md b/chart/README.md index 8f36380..80c2f3e 100644 --- a/chart/README.md +++ b/chart/README.md @@ -2,6 +2,7 @@ ```bash helm repo add k8s-ephemeral-storage-metrics https://jmcgrath207.github.io/k8s-ephemeral-storage-metrics/chart +helm repo update helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral-storage-metrics ``` @@ -9,7 +10,7 @@ helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral | Key | Type | Default | Description | |-----|------|---------|-------------| -| deploy_type | string | `"DaemonSet"` | | +| deploy_type | string | `"Deployment"` | Set to Deployment for a single controller that queries all nodes, or to DaemonSet | | dev.enabled | bool | `false` | | | extra.adjusted_polling_rate | bool | `false` | Create the ephemeral_storage_adjusted_polling_rate metrics to report Adjusted Poll Rate in milliseconds. Typically used for testing. | | image.imagePullPolicy | string | `"IfNotPresent"` | | diff --git a/chart/README.md.gotmpl b/chart/README.md.gotmpl index f0a42fe..6b6d89b 100644 --- a/chart/README.md.gotmpl +++ b/chart/README.md.gotmpl @@ -2,6 +2,7 @@ ```bash helm repo add k8s-ephemeral-storage-metrics https://jmcgrath207.github.io/k8s-ephemeral-storage-metrics/chart +helm repo update helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral-storage-metrics ``` diff --git a/chart/index.yaml b/chart/index.yaml index 924fb67..03f5bf2 100644 --- a/chart/index.yaml +++ b/chart/index.yaml @@ -1,6 +1,28 @@ apiVersion: v1 entries: k8s-ephemeral-storage-metrics: + - annotations: + artifacthub.io/license: MIT + artifacthub.io/links: | + - name: Documentation + url: https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics + artifacthub.io/prerelease: "false" + apiVersion: v2 + appVersion: 1.1.0 + created: "2023-10-22T19:40:25.810173641-05:00" + description: Ephemeral storage metrics for prometheus
operator. + digest: fe6be3c20af159cb4e7adb25fe96fe578d07fa49a6f8af5097c334571f644ec4 + home: https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics + keywords: + - kubernetes + - metrics + kubeVersion: '>=1.21.0-0' + name: k8s-ephemeral-storage-metrics + sources: + - https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics + urls: + - https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics/releases/download/1.1.0/k8s-ephemeral-storage-metrics-1.1.0.tgz + version: 1.1.0 - annotations: artifacthub.io/license: MIT artifacthub.io/links: | @@ -67,4 +89,4 @@ entries: urls: - https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics/releases/download/1.0.0/k8s-ephemeral-storage-metrics-1.0.0.tgz version: 1.0.0 -generated: "2023-10-20T13:24:12.479459598-05:00" +generated: "2023-10-22T19:40:25.809884043-05:00" diff --git a/chart/templates/DeployType.yaml b/chart/templates/DeployType.yaml index 1cb6430..b9234df 100644 --- a/chart/templates/DeployType.yaml +++ b/chart/templates/DeployType.yaml @@ -53,6 +53,8 @@ spec: successThreshold: 1 timeoutSeconds: 1 env: + - name: DEPLOY_TYPE + value: "{{ .Values.deploy_type }}" - name: SCRAPE_INTERVAL value: "{{ .Values.interval }}" - name: LOG_LEVEL @@ -61,8 +63,10 @@ spec: - name: ADJUSTED_POLLING_RATE value: "{{ .Values.extra.adjusted_polling_rate }}" {{ end }} + {{ if eq .Values.deploy_type "DaemonSet" }} - name: CURRENT_NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName + {{ end }} diff --git a/chart/templates/RBAC.yaml b/chart/templates/RBAC.yaml index a5d5cd2..898843a 100644 --- a/chart/templates/RBAC.yaml +++ b/chart/templates/RBAC.yaml @@ -7,8 +7,8 @@ metadata: {{- include "chart.labels" . 
| nindent 4 }} rules: - apiGroups: [""] - resources: ["nodes/proxy"] - verbs: ["get"] + resources: ["nodes","nodes/proxy"] + verbs: ["get","list"] --- diff --git a/chart/values.yaml b/chart/values.yaml index 36298e1..6eee1ac 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -1,9 +1,10 @@ image: repository: ghcr.io/jmcgrath207/k8s-ephemeral-storage-metrics - tag: 1.0.2 + tag: 1.1.0 imagePullPolicy: IfNotPresent log_level: info -deploy_type: DaemonSet +# -- Set as Deployment for single controller to query all nodes or Daemonset +deploy_type: Deployment # Note in testing, Kube API does not refresh faster than 10 seconds # -- Polling rate for exporter interval: 15 # Seconds diff --git a/main.go b/main.go index 788c7bf..6a9f5f9 100644 --- a/main.go +++ b/main.go @@ -9,6 +9,7 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/rs/zerolog" "github.com/rs/zerolog/log" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" @@ -18,16 +19,19 @@ import ( "path/filepath" "runtime" "strconv" + "sync" "time" ) var ( inCluster string clientset *kubernetes.Clientset - currentNode string sampleInterval int64 adjustedPollingRate bool adjustedTimeGauge prometheus.Gauge + deployType string + nodeSlice []string + nodeWaitGroup sync.WaitGroup ) func getEnv(key, fallback string) string { @@ -97,8 +101,34 @@ type ephemeralStorageMetrics struct { } } -func getMetrics() { +func getNodes() { + nodeWaitGroup.Add(1) + if deployType != "Deployment" { + nodeSlice = append(nodeSlice, getEnv("CURRENT_NODE_NAME", "")) + nodeWaitGroup.Done() + return + } + for { + nodeSlice = nil + nodes, _ := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + for _, node := range nodes.Items { + nodeSlice = append(nodeSlice, node.Name) + } + nodeWaitGroup.Done() + time.Sleep(1 * time.Minute) + nodeWaitGroup.Add(1) + } + +} + +type CollectMetric struct { + usedBytes float64 + 
labels prometheus.Labels +} +func getMetrics() { + nodeWaitGroup.Wait() + var labelsList []CollectMetric opsQueued := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "ephemeral_storage_pod_usage", Help: "Used to expose Ephemeral Storage metrics for pod in bytes ", @@ -116,7 +146,6 @@ func getMetrics() { prometheus.MustRegister(opsQueued) log.Debug().Msg(fmt.Sprintf("getMetrics has been invoked")) - currentNode = getEnv("CURRENT_NODE_NAME", "") if adjustedPollingRate { adjustedTimeGauge = prometheus.NewGauge(prometheus.GaugeOpts{ @@ -132,37 +161,55 @@ func getMetrics() { for { start := time.Now() - content, err := clientset.RESTClient().Get().AbsPath(fmt.Sprintf("/api/v1/nodes/%s/proxy/stats/summary", currentNode)).DoRaw(context.Background()) - if err != nil { - log.Error().Msg(fmt.Sprintf("ErrorBadRequst : %s\n", err.Error())) - os.Exit(1) - } - log.Debug().Msg(fmt.Sprintf("Fetched proxy stats from node : %s", currentNode)) - var data ephemeralStorageMetrics - _ = json.Unmarshal(content, &data) - - opsQueued.Reset() // reset this metrics in the Exporter to flush dead pods - - nodeName := data.Node.NodeName - for _, pod := range data.Pods { - podName := pod.PodRef.Name - podNamespace := pod.PodRef.Namespace - usedBytes := pod.EphemeralStorage.UsedBytes - if podNamespace == "" || (usedBytes == 0 && pod.EphemeralStorage.AvailableBytes == 0 && pod.EphemeralStorage.CapacityBytes == 0) { - log.Warn().Msg(fmt.Sprintf("pod %s/%s on %s has no metrics on its ephemeral storage usage", podName, podNamespace, nodeName)) - log.Warn().Msg(fmt.Sprintf("raw content %v", content)) + for _, node := range nodeSlice { + + content, err := clientset.RESTClient().Get().AbsPath(fmt.Sprintf("/api/v1/nodes/%s/proxy/stats/summary", node)).DoRaw(context.Background()) + if err != nil { + log.Error().Msg(fmt.Sprintf("ErrorBadRequst : %s\n", err.Error())) + os.Exit(1) } - opsQueued.With(prometheus.Labels{"pod_namespace": podNamespace, - "pod_name": podName, "node_name": 
nodeName}).Set(usedBytes) - if adjustedPollingRate { - adjustedTimeGauge.Set(float64(adjustTime)) + log.Debug().Msg(fmt.Sprintf("Fetched proxy stats from node : %s", node)) + var data ephemeralStorageMetrics + _ = json.Unmarshal(content, &data) + + nodeName := data.Node.NodeName + for _, pod := range data.Pods { + podName := pod.PodRef.Name + podNamespace := pod.PodRef.Namespace + usedBytes := pod.EphemeralStorage.UsedBytes + if podNamespace == "" || (usedBytes == 0 && pod.EphemeralStorage.AvailableBytes == 0 && pod.EphemeralStorage.CapacityBytes == 0) { + log.Warn().Msg(fmt.Sprintf("pod %s/%s on %s has no metrics on its ephemeral storage usage", podName, podNamespace, nodeName)) + log.Warn().Msg(fmt.Sprintf("raw content %v", content)) + } + labelsList = append(labelsList, CollectMetric{ + usedBytes, + prometheus.Labels{"pod_namespace": podNamespace, + "pod_name": podName, "node_name": nodeName}, + }) + + log.Debug().Msg(fmt.Sprintf("pod %s/%s on %s with usedBytes: %f", podNamespace, podName, nodeName, usedBytes)) } + } - log.Debug().Msg(fmt.Sprintf("pod %s/%s on %s with usedBytes: %f", podNamespace, podName, nodeName, usedBytes)) + // reset this metrics in the Exporter to flush dead pods + opsQueued.Reset() + // Push new metrics to exporter + for _, x := range labelsList { + opsQueued.With(x.labels).Set(x.usedBytes) } + // Zero out collection list + labelsList = nil elapsedTime := time.Now().Sub(start).Milliseconds() adjustTime = sampleInterval - elapsedTime + if adjustTime <= 0.0 { + log.Error().Msgf("Adjusted Poll Rate: %d ms", adjustTime) + log.Error().Msgf("Polling Rate could not keep up. 
Adjust your Interval to a higher number than %d", sampleInterval) + os.Exit(1) + } + if adjustedPollingRate { + adjustedTimeGauge.Set(float64(adjustTime)) + } log.Debug().Msgf("Adjusted Poll Rate: %d ms", adjustTime) time.Sleep(time.Duration(adjustTime) * time.Millisecond) } @@ -193,9 +240,11 @@ func main() { flag.Parse() setLogger() getK8sClient() + go getNodes() go getMetrics() port := getEnv("METRICS_PORT", "9100") adjustedPollingRate, _ = strconv.ParseBool(getEnv("ADJUSTED_POLLING_RATE", "false")) + deployType = getEnv("DEPLOY_TYPE", "DaemonSet") http.Handle("/metrics", promhttp.Handler()) log.Info().Msg(fmt.Sprintf("Starting server listening on :%s", port)) err := http.ListenAndServe(fmt.Sprintf(":%s", port), nil) diff --git a/scripts/create_kind.sh b/scripts/create_kind.sh index c862c0c..d294e25 100755 --- a/scripts/create_kind.sh +++ b/scripts/create_kind.sh @@ -24,7 +24,7 @@ kubectl get nodes -o wide # Deploy Service Monitor CRD kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml -if [[ ! $ENV =~ "e2e" ]]; then +if ! 
[[ $ENV =~ "e2e" ]]; then # Deploy Prometheus helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update diff --git a/tests/e2e/deployment_test.go b/tests/e2e/deployment_test.go index 7bb6f1b..899b3ba 100644 --- a/tests/e2e/deployment_test.go +++ b/tests/e2e/deployment_test.go @@ -179,7 +179,8 @@ var _ = ginkgo.Describe("Test Metrics\n", func() { ginkgo.Specify("\nReturn A Record IP addresses and Proxy IP address", func() { var checkSlice []string checkSlice = append(checkSlice, "ephemeral_storage_pod_usage", - "pod_name=\"k8s-ephemeral-storage", "ephemeral_storage_adjusted_polling_rate") + "pod_name=\"k8s-ephemeral-storage", "ephemeral_storage_adjusted_polling_rate", + "node_name=\"ephemeral-metrics-cluster-worker", "node_name=\"ephemeral-metrics-cluster-control-plane") checkPrometheus(checkSlice) }) })