From e799e8d6b29bab6706ccb6314de2051346c0e1c1 Mon Sep 17 00:00:00 2001 From: "g.beausire" Date: Tue, 18 Jun 2024 16:29:34 +0200 Subject: [PATCH] Add support for freeze in budgets Provide a new API to freeze budgets and metrics to track its usage as API --- DOC.md | 82 ++++++++++ Makefile | 2 +- README.md | 6 +- .../applicationdisruptionbudget_types.go | 10 ++ api/v1alpha1/nodedisruptionbudget_types.go | 2 + api/v1alpha1/zz_generated.deepcopy.go | 17 +++ .../applicationdisruptionbudget-crd.yaml | 11 ++ chart/templates/nodedisruptionbudget-crd.yaml | 11 ++ ...iteo.com_applicationdisruptionbudgets.yaml | 11 ++ ...tion.criteo.com_nodedisruptionbudgets.yaml | 11 ++ .../applicationdisruptionbudget_controller.go | 14 ++ internal/controller/metrics.go | 7 + .../nodedisruption_controller_test.go | 141 ++++++++++++++++++ .../nodedisruptionbudget_controller.go | 13 ++ 14 files changed, 336 insertions(+), 2 deletions(-) diff --git a/DOC.md b/DOC.md index fc56e71..da1cc45 100644 --- a/DOC.md +++ b/DOC.md @@ -94,6 +94,13 @@ ApplicationDisruptionBudgetSpec defines the desired state of ApplicationDisrupti A NodeDisruption is allowed if at most "maxDisruptions" nodes selected by selectors are unavailable after the disruption.
true + + freeze + object + + Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints
+ + false healthHook object @@ -121,6 +128,40 @@ Maintenance will proceed only if the endpoint responds 2XX.
+### ApplicationDisruptionBudget.spec.freeze +[↩ Parent](#applicationdisruptionbudgetspec) + + + +Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
enabledboolean + Freeze the budget to prevent any disruptions
+
false
reasonstring + Reason of the freeze
+
false
+ + ### ApplicationDisruptionBudget.spec.healthHook [↩ Parent](#applicationdisruptionbudgetspec) @@ -492,6 +533,13 @@ NodeDisruptionBudgetSpec defines the desired state of NodeDisruptionBudget A NodeDisruption is allowed if at most "minUndisruptedNodes" nodes selected by selectors are unavailable after the disruption.
true + + freeze + object + + Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints
+ + false nodeSelector object @@ -503,6 +551,40 @@ NodeDisruptionBudgetSpec defines the desired state of NodeDisruptionBudget +### NodeDisruptionBudget.spec.freeze +[↩ Parent](#nodedisruptionbudgetspec) + + + +Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
enabledboolean + Freeze the budget to prevent any disruptions
+
false
reasonstring + Reason of the freeze
+
false
+ + ### NodeDisruptionBudget.spec.nodeSelector [↩ Parent](#nodedisruptionbudgetspec) diff --git a/Makefile b/Makefile index 5b6ce47..8181796 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ test: manifests generate fmt vet envtest lint ## Run tests. ##@ Build .PHONY: build -build: manifests generate fmt vet ## Build manager binary. +build: manifests generate fmt vet gen-doc ## Build manager binary. CGO_ENABLED=0 go build -o bin/manager cmd/main.go .PHONY: run diff --git a/README.md b/README.md index cdd010a..31881d0 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,6 @@ In some cases, an application can be unhealthy even if all its pods are running. You can select Pods and/or PVCs. - ##### PVC selector The main reason of using a PVC selector is to ensure that node that contains data don't enter maintenance @@ -182,6 +181,11 @@ The hook will be called with a POST method containing the JSON encoded NodeDisru Note: It is not a replacement for readiness probes but a complement. +##### Freeze + +Budgets support freezing disruptions. By setting `spec.Freeze.Enabled`, the budget will reject all disruptions and give the reason specified in `spec.Freeze.Reason`. +It is equivalent to setting 0 as the max disruptions but it provide better messages. + #### Sample object ```yaml diff --git a/api/v1alpha1/applicationdisruptionbudget_types.go b/api/v1alpha1/applicationdisruptionbudget_types.go index 6d1a740..34bd40e 100644 --- a/api/v1alpha1/applicationdisruptionbudget_types.go +++ b/api/v1alpha1/applicationdisruptionbudget_types.go @@ -43,6 +43,16 @@ type ApplicationDisruptionBudgetSpec struct { // Maintenance will proceed only if the endpoint responds 2XX. // +kubebuilder:validation:Optional HealthHook HealthHookSpec `json:"healthHook,omitempty"` + // Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints + Freeze FreezeSpec `json:"freeze,omitempty"` +} + +// FreezeSpec defines the freeze status of the budget +type FreezeSpec struct { + // Freeze the budget to prevent any disruptions + Enabled bool `json:"enabled,omitempty"` + // Reason of the freeze + Reason string `json:"reason,omitempty"` } type HealthHookSpec struct { diff --git a/api/v1alpha1/nodedisruptionbudget_types.go b/api/v1alpha1/nodedisruptionbudget_types.go index a22da6c..df1df78 100644 --- a/api/v1alpha1/nodedisruptionbudget_types.go +++ b/api/v1alpha1/nodedisruptionbudget_types.go @@ -37,6 +37,8 @@ type NodeDisruptionBudgetSpec struct { MinUndisruptedNodes int `json:"minUndisruptedNodes"` // NodeSelector query over pods whose nodes are managed by the disruption budget. NodeSelector metav1.LabelSelector `json:"nodeSelector,omitempty"` + // Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints + Freeze FreezeSpec `json:"freeze,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 8ffcaf9..04327a4 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -89,6 +89,7 @@ func (in *ApplicationDisruptionBudgetSpec) DeepCopyInto(out *ApplicationDisrupti in.PodSelector.DeepCopyInto(&out.PodSelector) in.PVCSelector.DeepCopyInto(&out.PVCSelector) out.HealthHook = in.HealthHook + out.Freeze = in.Freeze } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApplicationDisruptionBudgetSpec. @@ -157,6 +158,21 @@ func (in *DisruptionBudgetStatus) DeepCopy() *DisruptionBudgetStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FreezeSpec) DeepCopyInto(out *FreezeSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FreezeSpec. +func (in *FreezeSpec) DeepCopy() *FreezeSpec { + if in == nil { + return nil + } + out := new(FreezeSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *HealthHookSpec) DeepCopyInto(out *HealthHookSpec) { *out = *in @@ -277,6 +293,7 @@ func (in *NodeDisruptionBudgetList) DeepCopyObject() runtime.Object { func (in *NodeDisruptionBudgetSpec) DeepCopyInto(out *NodeDisruptionBudgetSpec) { *out = *in in.NodeSelector.DeepCopyInto(&out.NodeSelector) + out.Freeze = in.Freeze } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeDisruptionBudgetSpec. diff --git a/chart/templates/applicationdisruptionbudget-crd.yaml b/chart/templates/applicationdisruptionbudget-crd.yaml index 83d2c9d..724eda5 100644 --- a/chart/templates/applicationdisruptionbudget-crd.yaml +++ b/chart/templates/applicationdisruptionbudget-crd.yaml @@ -54,6 +54,17 @@ spec: description: ApplicationDisruptionBudgetSpec defines the desired state of ApplicationDisruptionBudget properties: + freeze: + description: Define the freeze status of the budget. Frozen budget reject + all disruptions ignoring any other constraints + properties: + enabled: + description: Freeze the budget to prevent any disruptions + type: boolean + reason: + description: Reason of the freeze + type: string + type: object healthHook: description: |- Define a optional hook to call when validating a NodeDisruption. diff --git a/chart/templates/nodedisruptionbudget-crd.yaml b/chart/templates/nodedisruptionbudget-crd.yaml index cb9612f..5306384 100644 --- a/chart/templates/nodedisruptionbudget-crd.yaml +++ b/chart/templates/nodedisruptionbudget-crd.yaml @@ -56,6 +56,17 @@ spec: spec: description: NodeDisruptionBudgetSpec defines the desired state of NodeDisruptionBudget properties: + freeze: + description: Define the freeze status of the budget. Frozen budget reject + all disruptions ignoring any other constraints + properties: + enabled: + description: Freeze the budget to prevent any disruptions + type: boolean + reason: + description: Reason of the freeze + type: string + type: object maxDisruptedNodes: description: A NodeDisruption is allowed if at most "maxDisruptedNodes" nodes selected by selectors are unavailable after the disruption. diff --git a/config/crd/bases/nodedisruption.criteo.com_applicationdisruptionbudgets.yaml b/config/crd/bases/nodedisruption.criteo.com_applicationdisruptionbudgets.yaml index bd2f461..dbbaf7c 100644 --- a/config/crd/bases/nodedisruption.criteo.com_applicationdisruptionbudgets.yaml +++ b/config/crd/bases/nodedisruption.criteo.com_applicationdisruptionbudgets.yaml @@ -53,6 +53,17 @@ spec: description: ApplicationDisruptionBudgetSpec defines the desired state of ApplicationDisruptionBudget properties: + freeze: + description: Define the freeze status of the budget. Frozen budget + reject all disruptions ignoring any other constraints + properties: + enabled: + description: Freeze the budget to prevent any disruptions + type: boolean + reason: + description: Reason of the freeze + type: string + type: object healthHook: description: |- Define a optional hook to call when validating a NodeDisruption. diff --git a/config/crd/bases/nodedisruption.criteo.com_nodedisruptionbudgets.yaml b/config/crd/bases/nodedisruption.criteo.com_nodedisruptionbudgets.yaml index b418220..d804c59 100644 --- a/config/crd/bases/nodedisruption.criteo.com_nodedisruptionbudgets.yaml +++ b/config/crd/bases/nodedisruption.criteo.com_nodedisruptionbudgets.yaml @@ -55,6 +55,17 @@ spec: spec: description: NodeDisruptionBudgetSpec defines the desired state of NodeDisruptionBudget properties: + freeze: + description: Define the freeze status of the budget. Frozen budget + reject all disruptions ignoring any other constraints + properties: + enabled: + description: Freeze the budget to prevent any disruptions + type: boolean + reason: + description: Reason of the freeze + type: string + type: object maxDisruptedNodes: description: A NodeDisruption is allowed if at most "maxDisruptedNodes" nodes selected by selectors are unavailable after the disruption. diff --git a/internal/controller/applicationdisruptionbudget_controller.go b/internal/controller/applicationdisruptionbudget_controller.go index 661e015..bf64e61 100644 --- a/internal/controller/applicationdisruptionbudget_controller.go +++ b/internal/controller/applicationdisruptionbudget_controller.go @@ -115,6 +115,12 @@ func PruneADBMetrics(ref nodedisruptionv1alpha1.NamespacedName) { // UpdateADBMetrics update metrics for an ADB func UpdateADBMetrics(ref nodedisruptionv1alpha1.NamespacedName, adb *nodedisruptionv1alpha1.ApplicationDisruptionBudget) { DisruptionBudgetMaxDisruptions.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(adb.Spec.MaxDisruptions)) + if adb.Spec.Freeze.Enabled { + DisruptionBudgetFrozen.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(1) + } else { + DisruptionBudgetFrozen.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(0) + } + UpdateBudgetStatusMetrics(ref, adb.Status) } @@ -200,6 +206,14 @@ func (r *ApplicationDisruptionBudgetResolver) IsImpacted(disruptedNodes resolver // Return the number of disruption allowed considering a list of current node disruptions func (r *ApplicationDisruptionBudgetResolver) TryValidateDisruptionFromBudgetConstraints(_ resolver.NodeSet) nodedisruptionv1alpha1.DisruptedBudgetStatus { + if r.ApplicationDisruptionBudget.Spec.Freeze.Enabled { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: fmt.Sprintf("Budget frozen: %s", r.ApplicationDisruptionBudget.Spec.Freeze.Reason), + Ok: false, + } + } + if r.ApplicationDisruptionBudget.Status.DisruptionsAllowed-1 >= 0 { return nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: r.GetNamespacedName(), diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go index 80597be..cddd879 100644 --- a/internal/controller/metrics.go +++ b/internal/controller/metrics.go @@ -139,4 +139,11 @@ var ( }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_disruption_name"}, ) + DisruptionBudgetFrozen = promauto.With(metrics.Registry).NewGaugeVec( + prometheus.GaugeOpts{ + Name: METIC_PREFIX + "budget_disruption_frozen", + Help: "Budget frozen", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + ) ) diff --git a/internal/controller/nodedisruption_controller_test.go b/internal/controller/nodedisruption_controller_test.go index 92b66e6..5ea79a0 100644 --- a/internal/controller/nodedisruption_controller_test.go +++ b/internal/controller/nodedisruption_controller_test.go @@ -372,6 +372,147 @@ var _ = Describe("NodeDisruption controller", func() { }) }) + When("there is a node disruption budget that is frozen", func() { + It("rejects the node disruption", func() { + By("creating a budget that accepts everything") + ndb := &nodedisruptionv1alpha1.NodeDisruptionBudget{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "nodedisruption.criteo.com/v1alpha1", + Kind: "NodeDisruptionBudget", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: nodedisruptionv1alpha1.NodeDisruptionBudgetSpec{ + NodeSelector: metav1.LabelSelector{MatchLabels: nodeLabels1}, + MaxDisruptedNodes: 99, + Freeze: nodedisruptionv1alpha1.FreezeSpec{ + Enabled: false, + }, + }, + } + Expect(k8sClient.Create(ctx, ndb.DeepCopy())).Should(Succeed()) + + By("creating a new NodeDisruption") + disruption := &nodedisruptionv1alpha1.NodeDisruption{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "nodedisruption.criteo.com/v1alpha1", + Kind: "NodeDisruption", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: NDName, + Namespace: NDNamespace, + }, + Spec: nodedisruptionv1alpha1.NodeDisruptionSpec{ + NodeSelector: metav1.LabelSelector{MatchLabels: nodeLabels1}, + }, + } + Expect(k8sClient.Create(ctx, disruption.DeepCopy())).Should(Succeed()) + + By("checking the NodeDisruption is being granted") + createdDisruption := &nodedisruptionv1alpha1.NodeDisruption{} + + Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState { + err := k8sClient.Get(ctx, NDLookupKey, createdDisruption) + if err != nil { + panic("should be able to get") + } + return createdDisruption.Status.State + }, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted)) + + By("creating a budget that accepts everything") + Expect(k8sClient.Delete(ctx, ndb)).Should(Succeed()) + ndb.Spec.Freeze.Enabled = true + ndb.Spec.Freeze.Reason = "foobar" + Expect(k8sClient.Create(ctx, ndb)).Should(Succeed()) + + By("Re-creating a new NodeDisruption") + Expect(k8sClient.Delete(ctx, disruption)).Should(Succeed()) + Expect(k8sClient.Create(ctx, disruption.DeepCopy())).Should(Succeed()) + + By("checking the NodeDisruption is being rejected") + Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState { + err := k8sClient.Get(ctx, NDLookupKey, createdDisruption) + if err != nil { + panic("should be able to get") + } + return createdDisruption.Status.State + }, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Rejected)) + + Expect(createdDisruption.Status.DisruptedDisruptionBudgets[0].Reason).Should(Equal("Budget frozen: foobar")) + }) + }) + + When("there is a application disruption budget that is frozen", func() { + It("rejects the node disruption", func() { + By("creating a budget that accepts everything") + adb := &nodedisruptionv1alpha1.ApplicationDisruptionBudget{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "nodedisruption.criteo.com/v1alpha1", + Kind: "ApplicationDisruptionBudget", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: NDNamespace, + }, + Spec: nodedisruptionv1alpha1.ApplicationDisruptionBudgetSpec{ + PodSelector: metav1.LabelSelector{MatchLabels: podLabels}, + PVCSelector: metav1.LabelSelector{MatchLabels: podLabels}, + MaxDisruptions: 1, + }, + } + Expect(k8sClient.Create(ctx, adb.DeepCopy())).Should(Succeed()) + + By("creating a new NodeDisruption") + disruption := &nodedisruptionv1alpha1.NodeDisruption{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "nodedisruption.criteo.com/v1alpha1", + Kind: "NodeDisruption", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: NDName, + Namespace: NDNamespace, + }, + Spec: nodedisruptionv1alpha1.NodeDisruptionSpec{ + NodeSelector: metav1.LabelSelector{MatchLabels: nodeLabels1}, + }, + } + Expect(k8sClient.Create(ctx, disruption.DeepCopy())).Should(Succeed()) + + By("checking the NodeDisruption is being granted") + createdDisruption := &nodedisruptionv1alpha1.NodeDisruption{} + + Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState { + err := k8sClient.Get(ctx, NDLookupKey, createdDisruption) + if err != nil { + panic("should be able to get") + } + return createdDisruption.Status.State + }, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted)) + + By("creating a budget that accepts everything") + Expect(k8sClient.Delete(ctx, adb)).Should(Succeed()) + adb.Spec.Freeze.Enabled = true + adb.Spec.Freeze.Reason = "foobar" + Expect(k8sClient.Create(ctx, adb)).Should(Succeed()) + + By("Re-creating a new NodeDisruption") + Expect(k8sClient.Delete(ctx, disruption)).Should(Succeed()) + Expect(k8sClient.Create(ctx, disruption.DeepCopy())).Should(Succeed()) + + By("checking the NodeDisruption is being rejected") + Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState { + err := k8sClient.Get(ctx, NDLookupKey, createdDisruption) + if err != nil { + panic("should be able to get") + } + return createdDisruption.Status.State + }, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Rejected)) + + Expect(createdDisruption.Status.DisruptedDisruptionBudgets[0].Reason).Should(Equal("Budget frozen: foobar")) + }) + }) + When("there is a budget that doesn't support any disruption and retry is activated", func() { It("rejects the node disruption", func() { By("creating a budget that rejects everything") diff --git a/internal/controller/nodedisruptionbudget_controller.go b/internal/controller/nodedisruptionbudget_controller.go index 5a61a31..5591fc9 100644 --- a/internal/controller/nodedisruptionbudget_controller.go +++ b/internal/controller/nodedisruptionbudget_controller.go @@ -109,6 +109,11 @@ func PruneNDBMetrics(ref nodedisruptionv1alpha1.NamespacedName) { func UpdateNDBMetrics(ref nodedisruptionv1alpha1.NamespacedName, ndb *nodedisruptionv1alpha1.NodeDisruptionBudget) { DisruptionBudgetMaxDisruptedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(ndb.Spec.MaxDisruptedNodes)) DisruptionBudgetMinUndisruptedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(ndb.Spec.MinUndisruptedNodes)) + if ndb.Spec.Freeze.Enabled { + DisruptionBudgetFrozen.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(1) + } else { + DisruptionBudgetFrozen.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(0) + } UpdateBudgetStatusMetrics(ref, ndb.Status) } @@ -192,6 +197,14 @@ func (r *NodeDisruptionBudgetResolver) IsImpacted(disruptedNodes resolver.NodeSe // Return the number of disruption allowed considering a list of current node disruptions func (r *NodeDisruptionBudgetResolver) TryValidateDisruptionFromBudgetConstraints(disruptedNodes resolver.NodeSet) nodedisruptionv1alpha1.DisruptedBudgetStatus { + if r.NodeDisruptionBudget.Spec.Freeze.Enabled { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: fmt.Sprintf("Budget frozen: %s", r.NodeDisruptionBudget.Spec.Freeze.Reason), + Ok: false, + } + } + watchedNodes := resolver.NewNodeSetFromStringList(r.NodeDisruptionBudget.Status.WatchedNodes) disruptedNodesCount := watchedNodes.Intersection(disruptedNodes).Len() if r.NodeDisruptionBudget.Status.DisruptionsAllowed-disruptedNodesCount >= 0 {