Skip to content

Commit

Permalink
bug: fix node eviction
Browse files Browse the repository at this point in the history
  • Loading branch information
jmcgrath207 committed Mar 25, 2024
1 parent 9a51d51 commit ccd60df
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 110 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
branches:
- 'master'
pull_request:
types: [opened, reopened]
types: [opened, reopened,ready_for_review,synchronize]


jobs:
Expand Down
38 changes: 19 additions & 19 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ var (
ephemeralStorageNodePercentage bool
ephemeralStorageContainerLimitsPercentage bool
ephemeralStorageContainerVolumeLimitsPercentage bool
adjustedTimeGaugeVec *prometheus.GaugeVec
adjustedPollingRateGaugeVec *prometheus.GaugeVec
deployType string
nodeWaitGroup sync.WaitGroup
podDataWaitGroup sync.WaitGroup
Expand Down Expand Up @@ -267,6 +267,7 @@ func podWatch() {
ticker := time.NewTicker(time.Duration(sampleInterval) * time.Second)
defer ticker.Stop()

// TODO: make this more event driven instead of polling
for {
select {
case <-ticker.C:
Expand All @@ -288,8 +289,18 @@ func evictPodFromMetrics(p v1.Pod) {
}
}

func evictNode(node string) {

nodeAvailableGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": node})
nodeCapacityGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": node})
nodePercentageGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": node})
if adjustedPollingRate {
adjustedPollingRateGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": node})
}
log.Info().Msgf("Node %s does not exist. Removed from monitoring", node)
}

func getNodes() {
oldNodeSet := mapset.NewSet[string]()
nodeSet := mapset.NewSet[string]()
nodeWaitGroup.Add(1)
if deployType != "Deployment" {
Expand All @@ -306,24 +317,13 @@ func getNodes() {
nodeSlice = nodeSet.ToSlice()
nodeWaitGroup.Done()

// Poll for new nodes and remove dead ones
// Poll for new nodes
// TODO: make this more event driven instead of polling
for {
oldNodeSet = nodeSet.Clone()
nodeSet.Clear()
nodes, _ := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
for _, node := range nodes.Items {
nodeSet.Add(node.Name)
}
deadNodesSet := nodeSet.Difference(oldNodeSet)

// Evict Metrics where the node doesn't exist anymore.
for _, deadNode := range deadNodesSet.ToSlice() {
nodeAvailableGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": deadNode})
nodeCapacityGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": deadNode})
nodePercentageGaugeVec.DeletePartialMatch(prometheus.Labels{"node_name": deadNode})
log.Info().Msgf("Node %s does not exist. Removing from monitoring", deadNode)
}

nodeSlice = nodeSet.ToSlice()
time.Sleep(1 * time.Minute)
}
Expand Down Expand Up @@ -439,7 +439,7 @@ func setMetrics(node string) {

content, err := queryNode(node)
if err != nil {
log.Warn().Msg(fmt.Sprintf("Could not query node: %s. Skipping..", node))
evictNode(node)
return
}

Expand Down Expand Up @@ -467,7 +467,7 @@ func setMetrics(node string) {
log.Error().Msgf("Node %s: Polling Rate could not keep up. Adjust your Interval to a higher number than %d seconds", nodeName, sampleInterval)
}
if adjustedPollingRate {
adjustedTimeGaugeVec.With(prometheus.Labels{"node_name": nodeName}).Set(float64(adjustTime))
adjustedPollingRateGaugeVec.With(prometheus.Labels{"node_name": nodeName}).Set(float64(adjustTime))
}

}
Expand Down Expand Up @@ -567,7 +567,7 @@ func createMetrics() {
prometheus.MustRegister(nodePercentageGaugeVec)

if adjustedPollingRate {
adjustedTimeGaugeVec = prometheus.NewGaugeVec(prometheus.GaugeOpts{
adjustedPollingRateGaugeVec = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "ephemeral_storage_adjusted_polling_rate",
Help: "AdjustTime polling rate time after a Node API queries in Milliseconds",
},
Expand All @@ -576,7 +576,7 @@ func createMetrics() {
"node_name",
})

prometheus.MustRegister(adjustedTimeGaugeVec)
prometheus.MustRegister(adjustedPollingRateGaugeVec)
}

}
Expand Down
4 changes: 3 additions & 1 deletion scripts/create-minikube.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ c=$(docker ps -q) && [[ $c ]] && docker kill $c
docker network prune -f
minikube start \
--kubernetes-version="${K8S_VERSION}" \
--insecure-registry "10.0.0.0/24"
--insecure-registry "10.0.0.0/24" \
--cpus=2 \
--memory=3900MB
minikube addons enable registry

# Add Service Monitor CRD
Expand Down
72 changes: 47 additions & 25 deletions scripts/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,42 @@ source helpers.sh
function main() {
local image_tag
local dockerfile
local registry
local image
local common_set_values
local common_set_values_arr
local grow_repo_image
local shrink_repo_image
local e2e_values_arr
local external_registry
local internal_registry
local status_code

trap 'trap_func' EXIT ERR

while [ "$(kubectl get pods -n kube-system -l kubernetes.io/minikube-addons=registry -o=jsonpath='{.items[*].status.phase}')" != "Running Running" ]; do
echo "waiting for minikube registry and proxy pod to start. Sleep 10" && sleep 10
# Wait until registry pod come up
while [ "$(kubectl get pods -n kube-system -l actual-registry=true -o=jsonpath='{.items[*].status.phase}')" != "Running" ]; do
echo "Waiting for registry pod to start. Sleep 10" && sleep 10
done

# Wait until registry-proxy pod come up
while [ "$(kubectl get pods -n kube-system -l registry-proxy=true -o=jsonpath='{.items[*].status.phase}')" != "Running" ]; do
echo "Waiting for registry proxy pod to start. Sleep 10" && sleep 10
done

# Use a while loop to repeatedly check the registry endpoint until health
while true; do
# Send a GET request to the endpoint and capture the HTTP status code
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://$(minikube ip):5000/v2/_catalog")

# Check if the status code is 200
if [ "$status_code" -eq 200 ]; then
echo "Registry endpoint is healthy. Status code: $status_code"
break
else
echo "Registry endpoint is not healthy. Status code: $status_code. Retrying..."
sleep 5 # Wait for 5 seconds before retrying
fi
done

# Need both. External to push and internal for pods to pull from registry in cluster
external_registry="$(minikube ip):5000"
internal_registry="$(kubectl get service -n kube-system registry --template='{{.spec.clusterIP}}')"
Expand All @@ -34,20 +57,19 @@ function main() {

grow_repo_image="k8s-ephemeral-storage-grow-test:latest"

docker build --build-arg TARGETOS=linux --build-arg TARGETARCH=amd64 -f ../DockerfileTestGrow \
-t "${external_registry}/${grow_repo_image}" -t "${internal_registry}/${grow_repo_image}" ../.
docker build --build-arg TARGETOS=linux --build-arg TARGETARCH=amd64 -f ../DockerfileTestGrow \
-t "${external_registry}/${grow_repo_image}" -t "${internal_registry}/${grow_repo_image}" ../.

docker save "${external_registry}/${grow_repo_image}" > /tmp/image.tar
${LOCALBIN}/crane push --insecure /tmp/image.tar "${external_registry}/${grow_repo_image}"
docker save "${external_registry}/${grow_repo_image}" >/tmp/image.tar
"${LOCALBIN}/crane" push --insecure /tmp/image.tar "${external_registry}/${grow_repo_image}"
rm /tmp/image.tar


shrink_repo_image="k8s-ephemeral-storage-shrink-test:latest"

docker build --build-arg TARGETOS=linux --build-arg TARGETARCH=amd64 -f ../DockerfileTestShrink \
-t "${external_registry}/${shrink_repo_image}" -t "${internal_registry}/${shrink_repo_image}" ../.
-t "${external_registry}/${shrink_repo_image}" -t "${internal_registry}/${shrink_repo_image}" ../.

docker save "${external_registry}/${shrink_repo_image}" > /tmp/image.tar
docker save "${external_registry}/${shrink_repo_image}" >/tmp/image.tar
${LOCALBIN}/crane push --insecure /tmp/image.tar "${external_registry}/${shrink_repo_image}"
rm /tmp/image.tar

Expand All @@ -62,13 +84,12 @@ function main() {
# Main image
main_repo_image="${DEPLOYMENT_NAME}:${image_tag}"
docker build --build-arg TARGETOS=linux --build-arg TARGETARCH=amd64 -f ../${dockerfile} \
-t "${external_registry}/${main_repo_image}" -t "${internal_registry}/${main_repo_image}" ../.
-t "${external_registry}/${main_repo_image}" -t "${internal_registry}/${main_repo_image}" ../.

docker save "${external_registry}/${main_repo_image}" > /tmp/image.tar
docker save "${external_registry}/${main_repo_image}" >/tmp/image.tar
${LOCALBIN}/crane push --insecure /tmp/image.tar "${external_registry}/${main_repo_image}"
rm /tmp/image.tar


### Install Chart ###

common_set_values_arr=(
Expand Down Expand Up @@ -96,31 +117,32 @@ function main() {
--create-namespace \
--namespace "${DEPLOYMENT_NAME}"


# Patch deploy so minikube image upload works.
if [[ $ENV == "debug" ]]; then
# Disable for Debugging of Delve.
kubectl patch deployments.apps -n "${DEPLOYMENT_NAME}" k8s-ephemeral-storage-metrics -p \
'{ "spec": {"template": { "spec":{"securityContext": null, "containers":[{"name":"metrics", "livenessProbe": null, "readinessProbe": null, "securityContext": null, "command": null, "args": null }]}}}}'
fi

# Kill dangling port forwards if found.
# Main Exporter Port
sudo ss -aK '( dport = :9100 or sport = :9100 )' | true
# Prometheus Port
sudo ss -aK '( dport = :9090 or sport = :9090 )' | true
# Pprof Port
sudo ss -aK '( dport = :6060 or sport = :6060 )' | true

# Start Exporter Port Forward
(
sleep 10
printf "\n\n" && while :; do kubectl port-forward -n $DEPLOYMENT_NAME service/k8s-ephemeral-storage-metrics 9100:9100 || sleep 5; done
printf "\n\n" && while :; do kubectl port-forward -n $DEPLOYMENT_NAME service/k8s-ephemeral-storage-metrics 9100:9100 || kill_main_exporter_port && sleep 5; done
) &

# Wait until main pod comes up
while [ "$(kubectl get pods -n $DEPLOYMENT_NAME -l app.kubernetes.io/name=k8s-ephemeral-storage-metrics -o=jsonpath='{.items[*].status.phase}')" != "Running" ]; do
echo "waiting for k8s-ephemeral-storage-metrics pod to start. Sleep 10" && sleep 10
echo "Waiting for k8s-ephemeral-storage-metrics pod to start. Sleep 10" && sleep 10
done

# Wait until grow-test comes up
while [ "$(kubectl get pods -n $DEPLOYMENT_NAME -l name=grow-test -o=jsonpath='{.items[*].status.phase}')" != "Running" ]; do
echo "Waiting for grow-test pod to start. Sleep 10" && sleep 10
done

# Wait until shrink-test comes up
while [ "$(kubectl get pods -n $DEPLOYMENT_NAME -l name=shrink-test -o=jsonpath='{.items[*].status.phase}')" != "Running" ]; do
echo "Waiting for shrink-test pod to start. Sleep 10" && sleep 10
done

if [[ $ENV == "debug" ]]; then
Expand Down
13 changes: 12 additions & 1 deletion scripts/helpers.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
#!/bin/bash

function kill_main_exporter_port {
# Main Exporter Port
sudo ss -aK '( dport = :9100 or sport = :9100 )' || true
}

function trap_func() {
set +e
{
helm delete $DEPLOYMENT_NAME -n $DEPLOYMENT_NAME
jobs -p | xargs kill -SIGSTOP
jobs -p | xargs kill -9
sudo ss -aK '( dport = :9100 or sport = :9100 )'
# Kill dangling port forwards if found.
kill_main_exporter_port
# Prometheus Port
sudo ss -aK '( dport = :9090 or sport = :9090 )' || true
# Pprof Port
sudo ss -aK '( dport = :6060 or sport = :6060 )' || true
} &> /dev/null
}

Expand Down
Loading

0 comments on commit ccd60df

Please sign in to comment.