diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index bda89c1..b4c9ced 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -2,6 +2,11 @@ name: e2e on: push: + branches: + - 'master' + pull_request: + types: [opened, reopened] + jobs: e2e: diff --git a/Makefile b/Makefile index f9c0ac6..18fda11 100644 --- a/Makefile +++ b/Makefile @@ -58,10 +58,10 @@ release-helm: cd .. release: github_login release-docker release-helm helm-docs - # ex. make VERSION=1.3.1 release + # ex. make VERSION=1.4.0 release release-github: github_login - # ex. make VERSION=1.3.1 release-github + # ex. make VERSION=1.4.0 release-github gh release create ${VERSION} --generate-notes gh release upload ${VERSION} "chart/k8s-ephemeral-storage-metrics-${VERSION}.tgz" rm chart/k8s-ephemeral-storage-metrics-*.tgz diff --git a/README.md b/README.md index b0d01b2..b82aa09 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,13 @@ helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral | dev.enabled | bool | `false` | | | image.imagePullPolicy | string | `"IfNotPresent"` | | | image.repository | string | `"ghcr.io/jmcgrath207/k8s-ephemeral-storage-metrics"` | | -| image.tag | string | `"1.3.1"` | | +| image.tag | string | `"1.4.0"` | | | interval | int | `15` | Polling node rate for exporter | | log_level | string | `"info"` | | | max_node_concurrency | int | `10` | Max number of concurrent query requests to the kubernetes API. 
| -| metrics | object | `{"adjusted_polling_rate":false,"ephemeral_storage_node_available":true,"ephemeral_storage_node_capacity":true,"ephemeral_storage_node_percentage":true,"ephemeral_storage_pod_usage":true}` | Set metrics you want to enable | +| metrics | object | `{"adjusted_polling_rate":false,"ephemeral_storage_container_limit_percentage":true,"ephemeral_storage_node_available":true,"ephemeral_storage_node_capacity":true,"ephemeral_storage_node_percentage":true,"ephemeral_storage_pod_usage":true}` | Set metrics you want to enable | | metrics.adjusted_polling_rate | bool | `false` | Create the ephemeral_storage_adjusted_polling_rate metrics to report Adjusted Poll Rate in milliseconds. Typically used for testing. | +| metrics.ephemeral_storage_container_limit_percentage | bool | `true` | Percentage of ephemeral storage used by a container in a pod | | metrics.ephemeral_storage_node_available | bool | `true` | Available ephemeral storage for a node | | metrics.ephemeral_storage_node_capacity | bool | `true` | Capacity of ephemeral storage for a node | | metrics.ephemeral_storage_node_percentage | bool | `true` | Percentage of ephemeral storage used on a node | diff --git a/chart/Chart.yaml b/chart/Chart.yaml index 2837b08..2073825 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: k8s-ephemeral-storage-metrics -version: 1.3.1 -appVersion: 1.3.1 +version: 1.4.0 +appVersion: 1.4.0 kubeVersion: ">=1.21.0-0" description: Ephemeral storage metrics for prometheus operator. 
home: https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics diff --git a/chart/README.md b/chart/README.md index aed5611..1a1d4ab 100644 --- a/chart/README.md +++ b/chart/README.md @@ -14,12 +14,13 @@ helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral | dev.enabled | bool | `false` | | | image.imagePullPolicy | string | `"IfNotPresent"` | | | image.repository | string | `"ghcr.io/jmcgrath207/k8s-ephemeral-storage-metrics"` | | -| image.tag | string | `"1.3.1"` | | +| image.tag | string | `"1.4.0"` | | | interval | int | `15` | Polling node rate for exporter | | log_level | string | `"info"` | | | max_node_concurrency | int | `10` | Max number of concurrent query requests to the kubernetes API. | -| metrics | object | `{"adjusted_polling_rate":false,"ephemeral_storage_node_available":true,"ephemeral_storage_node_capacity":true,"ephemeral_storage_node_percentage":true,"ephemeral_storage_pod_usage":true}` | Set metrics you want to enable | +| metrics | object | `{"adjusted_polling_rate":false,"ephemeral_storage_container_limit_percentage":true,"ephemeral_storage_node_available":true,"ephemeral_storage_node_capacity":true,"ephemeral_storage_node_percentage":true,"ephemeral_storage_pod_usage":true}` | Set metrics you want to enable | | metrics.adjusted_polling_rate | bool | `false` | Create the ephemeral_storage_adjusted_polling_rate metrics to report Adjusted Poll Rate in milliseconds. Typically used for testing. 
| +| metrics.ephemeral_storage_container_limit_percentage | bool | `true` | Percentage of ephemeral storage used by a container in a pod | | metrics.ephemeral_storage_node_available | bool | `true` | Available ephemeral storage for a node | | metrics.ephemeral_storage_node_capacity | bool | `true` | Capacity of ephemeral storage for a node | | metrics.ephemeral_storage_node_percentage | bool | `true` | Percentage of ephemeral storage used on a node | diff --git a/chart/index.yaml b/chart/index.yaml index 69a11af..db7f3f2 100644 --- a/chart/index.yaml +++ b/chart/index.yaml @@ -1,6 +1,28 @@ apiVersion: v1 entries: k8s-ephemeral-storage-metrics: + - annotations: + artifacthub.io/license: MIT + artifacthub.io/links: | + - name: Documentation + url: https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics + artifacthub.io/prerelease: "false" + apiVersion: v2 + appVersion: 1.4.0 + created: "2023-12-03T19:08:19.344214729-06:00" + description: Ephemeral storage metrics for prometheus operator. 
{{- /*
  Export the container-limit-percentage toggle to the exporter.
  The guard must test the same value that is exported: gating this entry on
  ephemeral_storage_node_percentage (as before) silently disabled the
  container metric whenever the node-percentage metric was turned off,
  because the exporter defaults the variable to "false" when it is unset.
*/}}
{{- if .Values.metrics.ephemeral_storage_container_limit_percentage }}
- name: EPHEMERAL_STORAGE_CONTAINER_LIMIT_PERCENTAGE
  value: "{{ .Values.metrics.ephemeral_storage_container_limit_percentage }}"
{{- end }}
| nindent 4 }} rules: - apiGroups: [""] - resources: ["nodes","nodes/proxy"] - verbs: ["get","list"] + resources: ["nodes","nodes/proxy", "pods"] + verbs: ["get","list", "watch"] --- diff --git a/chart/templates/test_deployments.yaml b/chart/templates/test_deployments.yaml index e0a70f8..3750c0e 100644 --- a/chart/templates/test_deployments.yaml +++ b/chart/templates/test_deployments.yaml @@ -18,6 +18,11 @@ spec: - image: local.io/local/shrink-test:latest imagePullPolicy: Never name: shrink-test + resources: + requests: + ephemeral-storage: "1Mi" + limits: + ephemeral-storage: "5Mi" --- @@ -40,4 +45,9 @@ spec: - image: local.io/local/grow-test:latest imagePullPolicy: Never name: grow-test + resources: + requests: + ephemeral-storage: "1Mi" + limits: + ephemeral-storage: "5Mi" {{ end }} diff --git a/chart/values.yaml b/chart/values.yaml index 5094a56..06de656 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -1,10 +1,12 @@ image: repository: ghcr.io/jmcgrath207/k8s-ephemeral-storage-metrics - tag: 1.3.1 + tag: 1.4.0 imagePullPolicy: IfNotPresent # -- Set metrics you want to enable metrics: + # -- Percentage of ephemeral storage used by a container in a pod + ephemeral_storage_container_limit_percentage: true # -- Current ephemeral byte usage of pod ephemeral_storage_pod_usage: true # -- Available ephemeral storage for a node diff --git a/main.go b/main.go index b53b87b..3507406 100644 --- a/main.go +++ b/main.go @@ -12,9 +12,12 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/rs/zerolog" "github.com/rs/zerolog/log" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/util/homedir" "net/http" @@ -27,24 +30,28 @@ import ( ) var ( - inCluster string - clientset *kubernetes.Clientset - sampleInterval int64 - sampleIntervalMill int64 - 
adjustedPollingRate bool - ephemeralStoragePodUsage bool - ephemeralStorageNodeAvailable bool - ephemeralStorageNodeCapacity bool - ephemeralStorageNodePercentage bool - adjustedTimeGaugeVec *prometheus.GaugeVec - deployType string - nodeWaitGroup sync.WaitGroup - podGaugeVec *prometheus.GaugeVec - nodeAvailableGaugeVec *prometheus.GaugeVec - nodeCapacityGaugeVec *prometheus.GaugeVec - nodePercentageGaugeVec *prometheus.GaugeVec - nodeSlice []string - maxNodeConcurrency int + inCluster string + clientset *kubernetes.Clientset + sampleInterval int64 + sampleIntervalMill int64 + adjustedPollingRate bool + ephemeralStoragePodUsage bool + ephemeralStorageNodeAvailable bool + ephemeralStorageNodeCapacity bool + ephemeralStorageNodePercentage bool + ephemeralStorageContainerLimitsPercentage bool + adjustedTimeGaugeVec *prometheus.GaugeVec + deployType string + nodeWaitGroup sync.WaitGroup + podRequestLimitsWaitGroup sync.WaitGroup + podGaugeVec *prometheus.GaugeVec + nodeAvailableGaugeVec *prometheus.GaugeVec + nodeCapacityGaugeVec *prometheus.GaugeVec + nodePercentageGaugeVec *prometheus.GaugeVec + containerPercentageLimitsVec *prometheus.GaugeVec + nodeSlice []string + maxNodeConcurrency int + podResourceLookup map[string]podContainers ) func getEnv(key, fallback string) string { @@ -114,6 +121,104 @@ type ephemeralStorageMetrics struct { } } +type podContainers struct { + containers []container +} + +type container struct { + name string + request float64 + limit float64 +} + +// used for getting request and limits from pod manifests +func getContainerRequestLimits(p v1.Pod) { + matchKey := v1.ResourceName("ephemeral-storage") + containers := []container{} + for _, x := range p.Spec.Containers { + setContainer := container{} + setContainer.name = x.Name + for key, val := range x.Resources.Requests { + if key == matchKey { + setContainer.request = val.AsApproximateFloat64() + } + } + for key, val := range x.Resources.Limits { + if key == matchKey { + setContainer.limit 
= val.AsApproximateFloat64() + } + } + containers = append(containers, setContainer) + } + + podResourceLookup[p.Name] = podContainers{containers: containers} +} + +func initGetPodsResourceLimits() { + podRequestLimitsWaitGroup.Add(1) + podResourceLookup = make(map[string]podContainers) + // Init Get List of all pods + pods, err := clientset.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{}) + if err != nil { + fmt.Printf("Error getting pods: %v\n", err) + os.Exit(1) + } + + for _, p := range pods.Items { + getContainerRequestLimits(p) + } + podRequestLimitsWaitGroup.Done() + +} + +func podWatchResourceLimits() { + podRequestLimitsWaitGroup.Wait() + stopCh := make(chan struct{}) + defer close(stopCh) + sharedInformerFactory := informers.NewSharedInformerFactory(clientset, 2*time.Second) + podInformer := sharedInformerFactory.Core().V1().Pods().Informer() + + // Define event handlers for Pod events + eventHandler := cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + p := obj.(*v1.Pod) + getContainerRequestLimits(*p) + }, + UpdateFunc: func(oldObj, newObj interface{}) { + p := newObj.(*v1.Pod) + getContainerRequestLimits(*p) + }, + DeleteFunc: func(obj interface{}) { + delete(podResourceLookup, obj.(*v1.Pod).Name) + }, + } + + // Register the event handlers with the informer + _, err := podInformer.AddEventHandler(eventHandler) + if err != nil { + log.Err(err) + os.Exit(1) + } + + // Start the informer to begin watching for Pod events + go sharedInformerFactory.Start(stopCh) + + // Use a ticker to trigger the watcher every 15 seconds + ticker := time.NewTicker(time.Duration(sampleInterval) * time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + log.Debug().Msg("Watching podWatchResourceLimits for Pod events...") + case <-stopCh: + log.Error().Msg("Watcher podWatchResourceLimits stopped.") + os.Exit(1) + } + } + +} + func getNodes() { oldNodeSet := mapset.NewSet[string]() nodeSet := mapset.NewSet[string]() @@ -187,6 
+292,76 @@ type CollectMetric struct { labels prometheus.Labels } +func generateLabels(podName string, podNamespace string, nodeName string, usedBytes float64, availableBytes float64, capacityBytes float64) []CollectMetric { + + var labelsList []CollectMetric + + if ephemeralStorageContainerLimitsPercentage { + podResult, ok := podResourceLookup[podName] + if ok { + for _, c := range podResult.containers { + labels := prometheus.Labels{"pod_namespace": podNamespace, + "pod_name": podName, "node_name": nodeName, "container": c.name} + if c.limit != 0 { + // Use Limit from Container + labelsList = append(labelsList, CollectMetric{ + value: (usedBytes / c.limit) * 100.0, + name: "ephemeral_storage_container_limit_percentage", + labels: labels, + }) + } else { + // Default to Node Available Ephemeral Storage + labelsList = append(labelsList, CollectMetric{ + value: (availableBytes / capacityBytes) * 100.0, + name: "ephemeral_storage_container_limit_percentage", + labels: labels, + }) + } + } + } + } + + if ephemeralStoragePodUsage { + labelsList = append(labelsList, CollectMetric{ + value: usedBytes, + name: "ephemeral_storage_pod_usage", + labels: prometheus.Labels{"pod_namespace": podNamespace, + "pod_name": podName, "node_name": nodeName}, + }) + log.Debug().Msg(fmt.Sprintf("pod %s/%s on %s with usedBytes: %f", podNamespace, podName, nodeName, usedBytes)) + } + if ephemeralStorageNodeAvailable { + labelsList = append(labelsList, CollectMetric{ + value: availableBytes, + name: "ephemeral_storage_node_available", + labels: prometheus.Labels{"node_name": nodeName}}, + ) + log.Debug().Msg(fmt.Sprintf("Node: %s availble bytes: %f", nodeName, availableBytes)) + } + + if ephemeralStorageNodeCapacity { + labelsList = append(labelsList, CollectMetric{ + value: capacityBytes, + name: "ephemeral_storage_node_capacity", + labels: prometheus.Labels{"node_name": nodeName}}, + ) + log.Debug().Msg(fmt.Sprintf("Node: %s capacity bytes: %f", nodeName, capacityBytes)) + } + + if 
ephemeralStorageNodeCapacity { + percentage := (availableBytes / capacityBytes) * 100.0 + labelsList = append(labelsList, CollectMetric{ + value: percentage, + name: "ephemeral_storage_node_percentage", + labels: prometheus.Labels{"node_name": nodeName}}, + ) + log.Debug().Msg(fmt.Sprintf("Node: %s percentage used: %f", nodeName, percentage)) + } + + return labelsList + +} + func setMetrics(node string) { var labelsList []CollectMetric @@ -215,44 +390,8 @@ func setMetrics(node string) { log.Warn().Msg(fmt.Sprintf("pod %s/%s on %s has no metrics on its ephemeral storage usage", podName, podNamespace, nodeName)) continue } - - if ephemeralStoragePodUsage { - labelsList = append(labelsList, CollectMetric{ - value: usedBytes, - name: "ephemeral_storage_pod_usage", - labels: prometheus.Labels{"pod_namespace": podNamespace, - "pod_name": podName, "node_name": nodeName}, - }) - log.Debug().Msg(fmt.Sprintf("pod %s/%s on %s with usedBytes: %f", podNamespace, podName, nodeName, usedBytes)) - } - if ephemeralStorageNodeAvailable { - labelsList = append(labelsList, CollectMetric{ - value: availableBytes, - name: "ephemeral_storage_node_available", - labels: prometheus.Labels{"node_name": nodeName}}, - ) - log.Debug().Msg(fmt.Sprintf("Node: %s availble bytes: %f", nodeName, availableBytes)) - } - - if ephemeralStorageNodeCapacity { - labelsList = append(labelsList, CollectMetric{ - value: capacityBytes, - name: "ephemeral_storage_node_capacity", - labels: prometheus.Labels{"node_name": nodeName}}, - ) - log.Debug().Msg(fmt.Sprintf("Node: %s capacity bytes: %f", nodeName, capacityBytes)) - } - - if ephemeralStorageNodeCapacity { - percentage := (availableBytes / capacityBytes) * 100.0 - labelsList = append(labelsList, CollectMetric{ - value: percentage, - name: "ephemeral_storage_node_percentage", - labels: prometheus.Labels{"node_name": nodeName}}, - ) - log.Debug().Msg(fmt.Sprintf("Node: %s percentage used: %f", nodeName, percentage)) - } - + labelsList = append(labelsList, 
generateLabels(podName, podNamespace, nodeName, usedBytes, + availableBytes, capacityBytes)...) } // Reset Metrics for this Node name to remove dead pods @@ -263,6 +402,8 @@ func setMetrics(node string) { switch x.name { case "ephemeral_storage_pod_usage": podGaugeVec.With(x.labels).Set(x.value) + case "ephemeral_storage_container_limit_percentage": + containerPercentageLimitsVec.With(x.labels).Set(x.value) case "ephemeral_storage_node_available": nodeAvailableGaugeVec.With(x.labels).Set(x.value) case "ephemeral_storage_node_capacity": @@ -304,6 +445,27 @@ func createMetrics() { } + if ephemeralStorageContainerLimitsPercentage { + containerPercentageLimitsVec = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "ephemeral_storage_container_limit_percentage", + Help: "Percentage of ephemeral storage used by a container in a pod", + }, + []string{ + // name of pod for Ephemeral Storage + "pod_name", + // namespace of pod for Ephemeral Storage + "pod_namespace", + // Name of Node where pod is placed. 
+ "node_name", + // Name of container + "container", + }, + ) + + prometheus.MustRegister(containerPercentageLimitsVec) + + } + if ephemeralStorageNodeAvailable { nodeAvailableGaugeVec = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "ephemeral_storage_node_available", @@ -364,6 +526,9 @@ func createMetrics() { func getMetrics() { nodeWaitGroup.Wait() + if ephemeralStorageContainerLimitsPercentage { + podRequestLimitsWaitGroup.Wait() + } p, _ := ants.NewPoolWithFunc(maxNodeConcurrency, func(node interface{}) { setMetrics(node.(string)) @@ -410,6 +575,7 @@ func main() { ephemeralStorageNodeAvailable, _ = strconv.ParseBool(getEnv("EPHEMERAL_STORAGE_NODE_AVAILABLE", "false")) ephemeralStorageNodeCapacity, _ = strconv.ParseBool(getEnv("EPHEMERAL_STORAGE_NODE_CAPACITY", "false")) ephemeralStorageNodePercentage, _ = strconv.ParseBool(getEnv("EPHEMERAL_STORAGE_NODE_PERCENTAGE", "false")) + ephemeralStorageContainerLimitsPercentage, _ = strconv.ParseBool(getEnv("EPHEMERAL_STORAGE_CONTAINER_LIMIT_PERCENTAGE", "false")) deployType = getEnv("DEPLOY_TYPE", "DaemonSet") sampleInterval, _ = strconv.ParseInt(getEnv("SCRAPE_INTERVAL", "15"), 10, 64) maxNodeConcurrency, _ = strconv.Atoi(getEnv("MAX_NODE_CONCURRENCY", "10")) @@ -418,6 +584,10 @@ func main() { setLogger() getK8sClient() createMetrics() + if ephemeralStorageContainerLimitsPercentage { + go initGetPodsResourceLimits() + go podWatchResourceLimits() + } go getNodes() go getMetrics() if deployType != "Deployment" && deployType != "DaemonSet" { diff --git a/tests/e2e/deployment_test.go b/tests/e2e/deployment_test.go index a482ff5..383b6de 100644 --- a/tests/e2e/deployment_test.go +++ b/tests/e2e/deployment_test.go @@ -107,6 +107,19 @@ func checkPrometheus(checkSlice []string) { } +func WatchContainerPercentage() { + status := 0 + re := regexp.MustCompile(`ephemeral_storage_container_limit_percentage{container="grow-test",node_name="ephemeral-metrics-cluster-worker".+,pod_namespace="ephemeral-metrics"}\s+(.+)`) + output 
:= requestPrometheusString() + match := re.FindAllStringSubmatch(output, -1) + floatValue, _ := strconv.ParseFloat(match[0][1], 64) + if floatValue < 100.0 { + status = 1 + } + gomega.Expect(status).Should(gomega.Equal(1)) + +} + func WatchNodePercentage() { status := 0 re := regexp.MustCompile(`ephemeral_storage_node_percentage\{node_name="ephemeral-metrics-cluster-control-plane"}\s+(.+)`) @@ -195,7 +208,8 @@ var _ = ginkgo.Describe("Test Metrics\n", func() { "ephemeral_storage_node_capacity", "ephemeral_storage_node_percentage", "pod_name=\"k8s-ephemeral-storage", "ephemeral_storage_adjusted_polling_rate", - "node_name=\"ephemeral-metrics-cluster-worker", "node_name=\"ephemeral-metrics-cluster-control-plane") + "node_name=\"ephemeral-metrics-cluster-worker", "node_name=\"ephemeral-metrics-cluster-control-plane", + "ephemeral_storage_container_limit_percentage") checkPrometheus(checkSlice) }) }) @@ -217,6 +231,11 @@ var _ = ginkgo.Describe("Test Metrics\n", func() { WatchNodePercentage() }) }) + ginkgo.Context("Test ephemeral_storage_node_percentage\n", func() { + ginkgo.Specify("\nMake sure percentage is not over 100", func() { + WatchContainerPercentage() + }) + }) }) func TestDeployments(t *testing.T) {