From 8a0ec514ae13f3fd9d3a06cc79e1cccce9ee9ba9 Mon Sep 17 00:00:00 2001 From: John Mcgrath Date: Sun, 24 Mar 2024 20:35:24 -0500 Subject: [PATCH] bug: fix node eviction and added scale up and down node test. --- .github/workflows/test.yaml | 2 +- Makefile | 4 +-- README.md | 4 +-- chart/Chart.yaml | 4 +-- chart/README.md | 4 +-- chart/index.yaml | 24 ++++++++++++++- chart/values.yaml | 2 +- main.go | 38 ++++++++++++------------ scripts/create-minikube.sh | 4 ++- scripts/deploy.sh | 42 ++++++++++++-------------- scripts/helpers.sh | 13 +++++++- tests/e2e/deployment_test.go | 57 +++++++++++++++++++++++++++++------- 12 files changed, 132 insertions(+), 66 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5913b6a..80726a4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -5,7 +5,7 @@ on: branches: - 'master' pull_request: - types: [opened, reopened] + types: [opened, reopened,ready_for_review,synchronize] jobs: diff --git a/Makefile b/Makefile index 9f9b6d2..c6fdc7b 100644 --- a/Makefile +++ b/Makefile @@ -70,10 +70,10 @@ release-helm: cd .. release: release-docker release-helm helm-docs - # ex. make VERSION=1.5.2 release + # ex. make VERSION=1.5.3 release release-github: - # ex. make VERSION=1.5.2 release-github + # ex. make VERSION=1.5.3 release-github gh release create ${VERSION} --generate-notes gh release upload ${VERSION} "chart/k8s-ephemeral-storage-metrics-${VERSION}.tgz" rm chart/k8s-ephemeral-storage-metrics-*.tgz diff --git a/README.md b/README.md index fed9999..b474eca 100644 --- a/README.md +++ b/README.md @@ -29,10 +29,10 @@ helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral |-----|------|---------|-------------| | affinity | object | `{}` | | | deploy_type | string | `"Deployment"` | Set as Deployment for single controller to query all nodes or Daemonset | -| dev | object | `{"enabled":false,"image":{"imagePullPolicy":"IfNotPresent"}}` | For local development of kind and/or deploy grow and shrink test pods | +| dev | object | `{"enabled":false,"grow":{"image":"ghcr.io/jmcgrath207/k8s-ephemeral-storage-grow-test:latest","imagePullPolicy":"IfNotPresent"},"shrink":{"image":"ghcr.io/jmcgrath207/k8s-ephemeral-storage-shrink-test:latest","imagePullPolicy":"IfNotPresent"}}` | For local development or testing that will deploy grow and shrink pods and debug service | | image.imagePullPolicy | string | `"IfNotPresent"` | | | image.repository | string | `"ghcr.io/jmcgrath207/k8s-ephemeral-storage-metrics"` | | -| image.tag | string | `"1.5.2"` | | +| image.tag | string | `"1.5.3"` | | | interval | int | `15` | Polling node rate for exporter | | log_level | string | `"info"` | | | max_node_concurrency | int | `10` | Max number of concurrent query requests to the kubernetes API. | diff --git a/chart/Chart.yaml b/chart/Chart.yaml index bd06e0d..082bfc4 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: k8s-ephemeral-storage-metrics -version: 1.5.2 -appVersion: 1.5.2 +version: 1.5.3 +appVersion: 1.5.3 kubeVersion: ">=1.21.0-0" description: Ephemeral storage metrics for prometheus operator. home: https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics diff --git a/chart/README.md b/chart/README.md index b979245..316b30c 100644 --- a/chart/README.md +++ b/chart/README.md @@ -12,10 +12,10 @@ helm upgrade --install my-deployment k8s-ephemeral-storage-metrics/k8s-ephemeral |-----|------|---------|-------------| | affinity | object | `{}` | | | deploy_type | string | `"Deployment"` | Set as Deployment for single controller to query all nodes or Daemonset | -| dev | object | `{"enabled":false,"image":{"imagePullPolicy":"IfNotPresent"}}` | For local development of kind and/or deploy grow and shrink test pods | +| dev | object | `{"enabled":false,"grow":{"image":"ghcr.io/jmcgrath207/k8s-ephemeral-storage-grow-test:latest","imagePullPolicy":"IfNotPresent"},"shrink":{"image":"ghcr.io/jmcgrath207/k8s-ephemeral-storage-shrink-test:latest","imagePullPolicy":"IfNotPresent"}}` | For local development or testing that will deploy grow and shrink pods and debug service | | image.imagePullPolicy | string | `"IfNotPresent"` | | | image.repository | string | `"ghcr.io/jmcgrath207/k8s-ephemeral-storage-metrics"` | | -| image.tag | string | `"1.5.2"` | | +| image.tag | string | `"1.5.3"` | | | interval | int | `15` | Polling node rate for exporter | | log_level | string | `"info"` | | | max_node_concurrency | int | `10` | Max number of concurrent query requests to the kubernetes API. | diff --git a/chart/index.yaml b/chart/index.yaml index 942da12..879a8ae 100644 --- a/chart/index.yaml +++ b/chart/index.yaml @@ -1,6 +1,28 @@ apiVersion: v1 entries: k8s-ephemeral-storage-metrics: + - annotations: + artifacthub.io/license: MIT + artifacthub.io/links: | + - name: Documentation + url: https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics + artifacthub.io/prerelease: "false" + apiVersion: v2 + appVersion: 1.5.3 + created: "2024-03-24T20:33:20.402818248-05:00" + description: Ephemeral storage metrics for prometheus operator. + digest: 0a7eb718c66bc9e8feb185a4bce62bad54e1cac84919e7836a09558d22e3a4a3 + home: https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics + keywords: + - kubernetes + - metrics + kubeVersion: '>=1.21.0-0' + name: k8s-ephemeral-storage-metrics + sources: + - https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics + urls: + - https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics/releases/download/1.5.3/k8s-ephemeral-storage-metrics-1.5.3.tgz + version: 1.5.3 - annotations: artifacthub.io/license: MIT artifacthub.io/links: | @@ -375,4 +397,4 @@ entries: urls: - https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics/releases/download/1.0.0/k8s-ephemeral-storage-metrics-1.0.0.tgz version: 1.0.0 -generated: "2024-02-05T23:04:04.581928343-06:00" +generated: "2024-03-24T20:33:20.40194353-05:00" diff --git a/chart/values.yaml b/chart/values.yaml index 1803877..e991777 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -1,6 +1,6 @@ image: repository: ghcr.io/jmcgrath207/k8s-ephemeral-storage-metrics - tag: 1.5.2 + tag: 1.5.3 imagePullPolicy: IfNotPresent # -- Set metrics you want to enable diff --git a/main.go b/main.go index 068e834..6eda5f9 100644 --- a/main.go +++ b/main.go @@ -42,7 +42,7 @@ var ( ephemeralStorageNodePercentage bool ephemeralStorageContainerLimitsPercentage bool ephemeralStorageContainerVolumeLimitsPercentage bool - adjustedTimeGaugeVec *prometheus.GaugeVec + adjustedPollingRateGaugeVec *prometheus.GaugeVec deployType string nodeWaitGroup sync.WaitGroup podDataWaitGroup sync.WaitGroup @@ -267,6 +267,7 @@ func podWatch() { ticker := time.NewTicker(time.Duration(sampleInterval) * time.Second) defer ticker.Stop() + // TODO: make this more event driven instead of polling for { select { case <-ticker.C: @@ -288,8 +289,18 @@ func evictPodFromMetrics(p v1.Pod) { } } +func evictNode(node string) { + + nodeAvailableGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": node}) + nodeCapacityGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": node}) + nodePercentageGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": node}) + if adjustedPollingRate { + adjustedPollingRateGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": node}) + } + log.Info().Msgf("Node %s does not exist. Removed from monitoring", node) +} + func getNodes() { - oldNodeSet := mapset.NewSet[string]() nodeSet := mapset.NewSet[string]() nodeWaitGroup.Add(1) if deployType != "Deployment" { @@ -306,24 +317,13 @@ func getNodes() { nodeSlice = nodeSet.ToSlice() nodeWaitGroup.Done() - // Poll for new nodes and remove dead ones + // Poll for new nodes + // TODO: make this more event driven instead of polling for { - oldNodeSet = nodeSet.Clone() - nodeSet.Clear() nodes, _ := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) for _, node := range nodes.Items { nodeSet.Add(node.Name) } - deadNodesSet := nodeSet.Difference(oldNodeSet) - - // Evict Metrics where the node doesn't exist anymore. - for _, deadNode := range deadNodesSet.ToSlice() { - nodeAvailableGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": deadNode}) - nodeCapacityGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": deadNode}) - nodePercentageGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": deadNode}) - log.Info().Msgf("Node %s does not exist. Removing from monitoring", deadNode) - } - nodeSlice = nodeSet.ToSlice() time.Sleep(1 * time.Minute) } @@ -439,7 +439,7 @@ func setMetrics(node string) { content, err := queryNode(node) if err != nil { - log.Warn().Msg(fmt.Sprintf("Could not query node: %s. Skipping..", node)) + evictNode(node) return } @@ -467,7 +467,7 @@ func setMetrics(node string) { log.Error().Msgf("Node %s: Polling Rate could not keep up. Adjust your Interval to a higher number than %d seconds", nodeName, sampleInterval) } if adjustedPollingRate { - adjustedTimeGaugeVec.With(prometheus.Labels{"node_name": nodeName}).Set(float64(adjustTime)) + adjustedPollingRateGaugeVec.With(prometheus.Labels{"node_name": nodeName}).Set(float64(adjustTime)) } } @@ -567,7 +567,7 @@ func createMetrics() { prometheus.MustRegister(nodePercentageGaugeVec) if adjustedPollingRate { - adjustedTimeGaugeVec = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + adjustedPollingRateGaugeVec = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "ephemeral_storage_adjusted_polling_rate", Help: "AdjustTime polling rate time after a Node API queries in Milliseconds", }, @@ -576,7 +576,7 @@ func createMetrics() { "node_name", }) - prometheus.MustRegister(adjustedTimeGaugeVec) + prometheus.MustRegister(adjustedPollingRateGaugeVec) } } diff --git a/scripts/create-minikube.sh b/scripts/create-minikube.sh index 3b211aa..fbfaf41 100755 --- a/scripts/create-minikube.sh +++ b/scripts/create-minikube.sh @@ -5,7 +5,9 @@ c=$(docker ps -q) && [[ $c ]] && docker kill $c docker network prune -f minikube start \ --kubernetes-version="${K8S_VERSION}" \ - --insecure-registry "10.0.0.0/24" + --insecure-registry "10.0.0.0/24" \ + --cpus=2 \ + --memory=3900MB minikube addons enable registry # Add Service Monitor CRD diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 91be204..a5e2110 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -13,19 +13,24 @@ source helpers.sh function main() { local image_tag local dockerfile - local registry - local image local common_set_values local common_set_values_arr local grow_repo_image local shrink_repo_image local e2e_values_arr + local external_registry + local internal_registry trap 'trap_func' EXIT ERR - while [ "$(kubectl get pods -n kube-system -l kubernetes.io/minikube-addons=registry -o=jsonpath='{.items[*].status.phase}')" != "Running Running" ]; do - echo "waiting for minikube registry and proxy pod to start. Sleep 10" && sleep 10 + while [ "$(kubectl get pods -n kube-system -l actual-registry=true -o=jsonpath='{.items[*].status.phase}')" != "Running" ]; do + echo "Waiting for registry pod to start. Sleep 10" && sleep 10 done + + while [ "$(kubectl get pods -n kube-system -l registry-proxy=true -o=jsonpath='{.items[*].status.phase}')" != "Running" ]; do + echo "Waiting for registry proxy pod to start. Sleep 10" && sleep 10 + done + # Need both. External to push and internal for pods to pull from registry in cluster external_registry="$(minikube ip):5000" internal_registry="$(kubectl get service -n kube-system registry --template='{{.spec.clusterIP}}')" @@ -34,20 +39,19 @@ function main() { grow_repo_image="k8s-ephemeral-storage-grow-test:latest" - docker build --build-arg TARGETOS=linux --build-arg TARGETARCH=amd64 -f ../DockerfileTestGrow \ - -t "${external_registry}/${grow_repo_image}" -t "${internal_registry}/${grow_repo_image}" ../. + docker build --build-arg TARGETOS=linux --build-arg TARGETARCH=amd64 -f ../DockerfileTestGrow \ + -t "${external_registry}/${grow_repo_image}" -t "${internal_registry}/${grow_repo_image}" ../. - docker save "${external_registry}/${grow_repo_image}" > /tmp/image.tar + docker save "${external_registry}/${grow_repo_image}" >/tmp/image.tar ${LOCALBIN}/crane push --insecure /tmp/image.tar "${external_registry}/${grow_repo_image}" rm /tmp/image.tar - shrink_repo_image="k8s-ephemeral-storage-shrink-test:latest" docker build --build-arg TARGETOS=linux --build-arg TARGETARCH=amd64 -f ../DockerfileTestShrink \ - -t "${external_registry}/${shrink_repo_image}" -t "${internal_registry}/${shrink_repo_image}" ../. + -t "${external_registry}/${shrink_repo_image}" -t "${internal_registry}/${shrink_repo_image}" ../. - docker save "${external_registry}/${shrink_repo_image}" > /tmp/image.tar + docker save "${external_registry}/${shrink_repo_image}" >/tmp/image.tar ${LOCALBIN}/crane push --insecure /tmp/image.tar "${external_registry}/${shrink_repo_image}" rm /tmp/image.tar @@ -62,13 +66,12 @@ function main() { # Main image main_repo_image="${DEPLOYMENT_NAME}:${image_tag}" docker build --build-arg TARGETOS=linux --build-arg TARGETARCH=amd64 -f ../${dockerfile} \ - -t "${external_registry}/${main_repo_image}" -t "${internal_registry}/${main_repo_image}" ../. + -t "${external_registry}/${main_repo_image}" -t "${internal_registry}/${main_repo_image}" ../. - docker save "${external_registry}/${main_repo_image}" > /tmp/image.tar + docker save "${external_registry}/${main_repo_image}" >/tmp/image.tar ${LOCALBIN}/crane push --insecure /tmp/image.tar "${external_registry}/${main_repo_image}" rm /tmp/image.tar - ### Install Chart ### common_set_values_arr=( @@ -96,7 +99,6 @@ function main() { --create-namespace \ --namespace "${DEPLOYMENT_NAME}" - # Patch deploy so minikube image upload works. if [[ $ENV == "debug" ]]; then # Disable for Debugging of Delve. @@ -104,23 +106,15 @@ function main() { '{ "spec": {"template": { "spec":{"securityContext": null, "containers":[{"name":"metrics", "livenessProbe": null, "readinessProbe": null, "securityContext": null, "command": null, "args": null }]}}}}' fi - # Kill dangling port forwards if found. - # Main Exporter Port - sudo ss -aK '( dport = :9100 or sport = :9100 )' | true - # Prometheus Port - sudo ss -aK '( dport = :9090 or sport = :9090 )' | true - # Pprof Port - sudo ss -aK '( dport = :6060 or sport = :6060 )' | true - # Start Exporter Port Forward ( sleep 10 - printf "\n\n" && while :; do kubectl port-forward -n $DEPLOYMENT_NAME service/k8s-ephemeral-storage-metrics 9100:9100 || sleep 5; done + printf "\n\n" && while :; do kubectl port-forward -n $DEPLOYMENT_NAME service/k8s-ephemeral-storage-metrics 9100:9100 || kill_main_exporter_port && sleep 5; done ) & # Wait until main pod comes up while [ "$(kubectl get pods -n $DEPLOYMENT_NAME -l app.kubernetes.io/name=k8s-ephemeral-storage-metrics -o=jsonpath='{.items[*].status.phase}')" != "Running" ]; do - echo "waiting for k8s-ephemeral-storage-metrics pod to start. Sleep 10" && sleep 10 + echo "Waiting for k8s-ephemeral-storage-metrics pod to start. Sleep 10" && sleep 10 done if [[ $ENV == "debug" ]]; then diff --git a/scripts/helpers.sh b/scripts/helpers.sh index 19286ca..7a713b4 100755 --- a/scripts/helpers.sh +++ b/scripts/helpers.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +function kill_main_exporter_port { + # Main Exporter Port + sudo ss -aK '( dport = :9100 or sport = :9100 )' || true +} function trap_func() { set +e @@ -5,7 +11,12 @@ function trap_func() { helm delete $DEPLOYMENT_NAME -n $DEPLOYMENT_NAME jobs -p | xargs kill -SIGSTOP jobs -p | xargs kill -9 - sudo ss -aK '( dport = :9100 or sport = :9100 )' + # Kill dangling port forwards if found. + kill_main_exporter_port + # Prometheus Port + sudo ss -aK '( dport = :9090 or sport = :9090 )' || true + # Pprof Port + sudo ss -aK '( dport = :6060 or sport = :6060 )' || true } &> /dev/null } diff --git a/tests/e2e/deployment_test.go b/tests/e2e/deployment_test.go index 2cf9fc7..e3598ec 100644 --- a/tests/e2e/deployment_test.go +++ b/tests/e2e/deployment_test.go @@ -6,6 +6,7 @@ import ( "github.com/onsi/gomega" "io" "net/http" + "os/exec" "regexp" "strconv" "strings" @@ -58,7 +59,7 @@ func CheckValues(ifFound map[string]bool) int { return status } -func checkPrometheus(checkSlice []string) { +func checkPrometheus(checkSlice []string, inverse bool) { var status int timeout := time.Second * 180 startTime := time.Now() @@ -84,10 +85,14 @@ func checkPrometheus(checkSlice []string) { if ifFound[a] { continue } - if strings.Contains(output, a) { + if inverse && !strings.Contains(output, a) { + ifFound[a] = true + + } else if !inverse && strings.Contains(output, a) { ifFound[a] = true - status = CheckValues(ifFound) } + + status = CheckValues(ifFound) } if status == 1 { @@ -97,7 +102,11 @@ func checkPrometheus(checkSlice []string) { } for key, value := range ifFound { if value { - ginkgo.GinkgoWriter.Printf("\nFound value: [ %v ] in prometheus exporter\n", key) + if inverse { + ginkgo.GinkgoWriter.Printf("\nDid not find value: [ %v ] in prometheus exporter\n", key) + } else { + ginkgo.GinkgoWriter.Printf("\nFound value: [ %v ] in prometheus exporter\n", key) + } continue } ginkgo.GinkgoWriter.Printf("\nDid not find value: [ %v ] in prometheus exporter\n", key) @@ -120,16 +129,12 @@ func WatchContainerPercentage() { } +// TODO: need to add func WatchContainerVolumePercentage() { - // TODO: add test for this once evictions for dead pods is handled. //ephemeral_storage_container_volume_limit_percentage } -func WatchDeadPod() { - // TODO: Deploy a pod and then remove it. Make sure it's completely evicted from prom metrics. -} - func WatchNodePercentage() { status := 0 re := regexp.MustCompile(`ephemeral_storage_node_percentage\{node_name="minikube"}\s+(.+)`) @@ -208,6 +213,24 @@ func WatchEphemeralPodSize(podname string, sizeChange float64, timeout time.Dura } +func scaleUp() { + cmd := exec.Command("make", "minikube_scale_up") + cmd.Dir = "../.." + + _, err := cmd.Output() + gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) + +} + +func scaleDown() { + cmd := exec.Command("make", "minikube_scale_down") + cmd.Dir = "../.." + + _, err := cmd.Output() + gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) + +} + var _ = ginkgo.Describe("Test Metrics\n", func() { ginkgo.Context("Observe labels\n", func() { @@ -220,7 +243,7 @@ var _ = ginkgo.Describe("Test Metrics\n", func() { "pod_name=\"k8s-ephemeral-storage", "ephemeral_storage_adjusted_polling_rate", "node_name=\"minikube", "ephemeral_storage_container_limit_percentage") - checkPrometheus(checkSlice) + checkPrometheus(checkSlice, false) }) }) ginkgo.Context("Observe change in storage metrics\n", func() { @@ -246,6 +269,20 @@ var _ = ginkgo.Describe("Test Metrics\n", func() { WatchContainerPercentage() }) }) + ginkgo.Context("Test Scale up\n", func() { + checkSlice := []string{ + "node_name=\"minikube-m02", + "ephemeral_storage_container_limit_percentage{container=\"kube-proxy\",node_name=\"minikube-m02\"", + } + ginkgo.Specify("\nScale up test to make sure pods and nodes are found", func() { + scaleUp() + checkPrometheus(checkSlice, false) + }) + ginkgo.Specify("\nScale Down test to make sure pods and nodes are evicted", func() { + scaleDown() + checkPrometheus(checkSlice, true) + }) + }) }) func TestDeployments(t *testing.T) {