Skip to content

Commit

Permalink
Implement spec.type to Node Disruption API (#67)
Browse files Browse the repository at this point in the history
Implement #64

Signed-off-by: f.bouzghaia <f.bouzghaia@criteo.com>
Co-authored-by: f.bouzghaia <f.bouzghaia@criteo.com>
  • Loading branch information
FatmaBouzghaia and f.bouzghaia authored Jun 28, 2024
1 parent 7d7fd52 commit 1376a1d
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 26 deletions.
7 changes: 7 additions & 0 deletions DOC.md
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,13 @@ NodeDisruptionSpec defines the desired state of NodeDisruption
Configure the retrying behavior of a NodeDisruption<br/>
</td>
<td>false</td>
</tr><tr>
<td><b>type</b></td>
<td>string</td>
<td>
Type of the node disruption<br/>
</td>
<td>false</td>
</tr></tbody>
</table>

Expand Down
2 changes: 2 additions & 0 deletions api/v1alpha1/nodedisruption_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ type NodeDisruptionSpec struct {
// Label query over nodes that will be impacted by the disruption
NodeSelector metav1.LabelSelector `json:"nodeSelector,omitempty"`
Retry RetrySpec `json:"retry,omitempty"`
// Type of the node disruption
Type string `json:"type,omitempty"`
}

// Configure the retrying behavior of a NodeDisruption
Expand Down
3 changes: 3 additions & 0 deletions chart/templates/nodedisruption-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ spec:
description: Enable retrying
type: boolean
type: object
type:
description: Type of the node disruption
type: string
type: object
status:
description: NodeDisruptionStatus defines the observed state of NodeDisruption
Expand Down
4 changes: 4 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package main
import (
"flag"
"os"
"strings"
"time"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
Expand Down Expand Up @@ -56,6 +57,7 @@ func main() {
var rejectEmptyNodeDisruption bool
var retryInterval time.Duration
var rejectOverlappingDisruption bool
var nodeDisruptionTypes string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand All @@ -64,6 +66,7 @@ func main() {
flag.BoolVar(&rejectEmptyNodeDisruption, "reject-empty-node-disruption", false, "Reject NodeDisruption matching no actual node.")
flag.DurationVar(&retryInterval, "retry-interval", controller.DefaultRetryInterval, "How long to wait between each retry (Default 60s)")
flag.BoolVar(&rejectOverlappingDisruption, "reject-overlapping-disruption", false, "Automatically reject any overlapping NodeDisruption (based on node selector), preserving the oldest one")
flag.StringVar(&nodeDisruptionTypes, "node-disruption-types", "", "The list of types allowed for a node disruption separated by a comma.")

opts := zap.Options{
Development: true,
Expand Down Expand Up @@ -104,6 +107,7 @@ func main() {
RejectEmptyNodeDisruption: rejectEmptyNodeDisruption,
RetryInterval: retryInterval,
RejectOverlappingDisruption: rejectOverlappingDisruption,
NodeDisruptionTypes: strings.Split(nodeDisruptionTypes, ","),
},
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NodeDisruption")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ spec:
description: Enable retrying
type: boolean
type: object
type:
description: Type of the node disruption
type: string
type: object
status:
description: NodeDisruptionStatus defines the observed state of NodeDisruption
Expand Down
21 changes: 14 additions & 7 deletions internal/controller/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,49 +17,56 @@ var (
Name: METIC_PREFIX + "node_disruption_granted_total",
Help: "Total number of granted node disruptions",
},
[]string{},
[]string{"type"},
)
NodeDisruptionRejectedTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_rejected_total",
Help: "Total number of rejected node disruptions",
},
[]string{},
[]string{"type"},
)
NodeDisruptionStateAsValue = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_state_value",
Help: "State of node disruption: pending=0, rejected=-1, accepted=1",
},
[]string{"node_disruption_name"},
[]string{"node_disruption_name", "type"},
)
NodeDisruptionStateAsLabel = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_state_label",
Help: "State of node disruption: 0 not in this state; 1 is in state",
},
[]string{"node_disruption_name", "state"},
[]string{"node_disruption_name", "state", "type"},
)
NodeDisruptionCreated = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_created",
Help: "Date of create of the node disruption",
},
[]string{"node_disruption_name"},
[]string{"node_disruption_name", "type"},
)
NodeDisruptionDeadline = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_deadline",
Help: "Date of the deadline of the node disruption (0 if unset)",
},
[]string{"node_disruption_name"},
[]string{"node_disruption_name", "type"},
)
NodeDisruptionImpactedNodes = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_impacted_node",
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
},
[]string{"node_disruption_name", "node_name"},
[]string{"node_disruption_name", "node_name", "type"},
)
NodeDisruptionType = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_type",
Help: "Type of the node disruption",
},
[]string{"node_disruption_name", "type"},
)
// DISRUPTION BUDGET METRICS
DisruptionBudgetCheckHealthHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec(
Expand Down
43 changes: 28 additions & 15 deletions internal/controller/nodedisruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
"k8s.io/utils/strings/slices"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
Expand All @@ -45,6 +46,8 @@ type NodeDisruptionReconcilerConfig struct {
RetryInterval time.Duration
// Reject NodeDisruption if its node selector overlaps an older NodeDisruption's selector
RejectOverlappingDisruption bool
// Specify which node disruption types are allowed to be granted
NodeDisruptionTypes []string
}

// NodeDisruptionReconciler reconciles NodeDisruptions
Expand Down Expand Up @@ -116,39 +119,40 @@ func PruneNodeDisruptionMetrics(nd_name string) {
NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionType.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
}

// UpdateNodeDisruptionMetrics update metrics for a Node Disruption
func UpdateNodeDisruptionMetrics(nd *nodedisruptionv1alpha1.NodeDisruption) {
nd_state := 0
if nd.Status.State == nodedisruptionv1alpha1.Pending {
nd_state = 0
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(0)
} else if nd.Status.State == nodedisruptionv1alpha1.Rejected {
nd_state = -1
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(0)
} else if nd.Status.State == nodedisruptionv1alpha1.Granted {
nd_state = 1
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(1)
}
NodeDisruptionStateAsValue.WithLabelValues(nd.Name).Set(float64(nd_state))
NodeDisruptionCreated.WithLabelValues(nd.Name).Set(float64(nd.CreationTimestamp.Unix()))
NodeDisruptionStateAsValue.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(nd_state))
NodeDisruptionCreated.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(nd.CreationTimestamp.Unix()))
// Deadline might not be set so it will be 0 but timestamp in Go are not Unix epoch
// so converting a 0 timestamp will not result in epoch 0. We override this to have nice values
deadline := nd.Spec.Retry.Deadline.Unix()
if nd.Spec.Retry.Deadline.IsZero() {
deadline = 0
}
NodeDisruptionDeadline.WithLabelValues(nd.Name).Set(float64(deadline))
NodeDisruptionDeadline.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(deadline))

for _, node_name := range nd.Status.DisruptedNodes {
NodeDisruptionImpactedNodes.WithLabelValues(nd.Name, node_name).Set(1)
NodeDisruptionImpactedNodes.WithLabelValues(nd.Name, node_name, nd.Spec.Type).Set(1)
}
}

Expand Down Expand Up @@ -195,9 +199,9 @@ func (ndr *SingleNodeDisruptionReconciler) TryTransitionState(ctx context.Contex
return err
}
if ndr.NodeDisruption.Status.State == nodedisruptionv1alpha1.Granted {
NodeDisruptionGrantedTotal.WithLabelValues().Inc()
NodeDisruptionGrantedTotal.WithLabelValues(ndr.NodeDisruption.Spec.Type).Inc()
} else if ndr.NodeDisruption.Status.State == nodedisruptionv1alpha1.Rejected {
NodeDisruptionRejectedTotal.WithLabelValues().Inc()
NodeDisruptionRejectedTotal.WithLabelValues(ndr.NodeDisruption.Spec.Type).Inc()
}
}
// If the disruption is not Pending nor unknown, the state is final
Expand Down Expand Up @@ -331,6 +335,15 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithInternalConstraints(ctx c
return anyFailed, []nodedisruptionv1alpha1.DisruptedBudgetStatus{status}, err
}

if len(ndr.Config.NodeDisruptionTypes) != 0 && !slices.Contains(ndr.Config.NodeDisruptionTypes, ndr.NodeDisruption.Spec.Type) {
status := nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: ndr.getNodeDisruptionReference(),
Reason: "Type provided of node disruption is not managed",
Ok: false,
}
return true, []nodedisruptionv1alpha1.DisruptedBudgetStatus{status}, nil
}

return false, statuses, nil
}

Expand Down
116 changes: 112 additions & 4 deletions internal/controller/nodedisruption_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func startDummyHTTPServer(handle http.HandlerFunc, listenAddr string) (cancelFn
return func() { _ = srv.Shutdown(context.Background()) }
}

func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[string]string, ctx context.Context) {
func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[string]string, disruptionType string, ctx context.Context) {
overlappingDisruption := &nodedisruptionv1alpha1.NodeDisruption{
TypeMeta: metav1.TypeMeta{
APIVersion: "nodedisruption.criteo.com/v1alpha1",
Expand All @@ -124,6 +124,7 @@ func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[s
},
Spec: nodedisruptionv1alpha1.NodeDisruptionSpec{
NodeSelector: metav1.LabelSelector{MatchLabels: nodeSelectorLabel},
Type: disruptionType,
},
}
Expect(k8sClient.Create(ctx, overlappingDisruption.DeepCopy())).Should(Succeed())
Expand Down Expand Up @@ -644,7 +645,7 @@ var _ = Describe("NodeDisruption controller", func() {

BeforeEach(func() {
By("configuring a first disruption")
createNodeDisruption(firstDisruptionName, NDNamespace, nodeLabels1, ctx)
createNodeDisruption(firstDisruptionName, NDNamespace, nodeLabels1, "", ctx)
})
AfterEach(func() {
clearAllNodeDisruptionResources()
Expand All @@ -664,7 +665,7 @@ var _ = Describe("NodeDisruption controller", func() {
})

By("creating an overlapping disruption")
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, ctx)
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, "", ctx)
})
It("rejects the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
Expand All @@ -691,7 +692,7 @@ var _ = Describe("NodeDisruption controller", func() {
})

By("creating an overlapping disruption")
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, ctx)
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, "", ctx)
})
It("accepts the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
Expand All @@ -705,5 +706,112 @@ var _ = Describe("NodeDisruption controller", func() {
})
})
})

Describe("Reject typed disruptions feature", Label("Node disruption type"), Ordered, func() {
var (
createdDisruption = &nodedisruptionv1alpha1.NodeDisruption{}
disruptionName = "disruption-test"
allowedDisruptionTypes = []string{"maintenance", "decommission", "tor-maintenance"}
)

AfterEach(func() {
clearAllNodeDisruptionResources()
cancelFn()
})

Context("NodeDisruptionTypes is enabled", func() {
When("the created disruption has an allowed type", func() {
BeforeEach(func() {
By("Configuring a disruption")
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "maintenance", ctx)

By("starting a reconciler with NodeDisruptionTypes enabled")
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
RejectOverlappingDisruption: false,
RetryInterval: time.Second * 1,
NodeDisruptionTypes: allowedDisruptionTypes,
})
})
It("grants the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
if err != nil {
panic("should be able to get")
}
return createdDisruption.Status.State
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
})
})
When("the created disruption has not an allowed type", func() {
BeforeEach(func() {
By("Configuring a disruption")
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "toto", ctx)

By("starting a reconciler with NodeDisruptionTypes enabled")
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
RejectOverlappingDisruption: false,
RetryInterval: time.Second * 1,
NodeDisruptionTypes: allowedDisruptionTypes,
})
})
It("rejects the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
if err != nil {
panic("should be able to get")
}
return createdDisruption.Status.State
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Rejected))
})
})
})

Context("NodeDisruptionTypes is disabled", func() {
When("the created disruption has a type", func() {
BeforeEach(func() {
By("Configuring a disruption")
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "maintenance", ctx)

By("starting a reconciler with NodeDisruptionTypes enabled")
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
RejectOverlappingDisruption: false,
RetryInterval: time.Second * 1,
NodeDisruptionTypes: []string{},
})
})
It("grants the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
if err != nil {
panic("should be able to get")
}
return createdDisruption.Status.State
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
})
})
When("the created disruption has not a type", func() {
BeforeEach(func() {
By("Configuring a disruption")
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "", ctx)

By("starting a reconciler with NodeDisruptionTypes enabled")
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
RejectOverlappingDisruption: false,
RetryInterval: time.Second * 1,
NodeDisruptionTypes: []string{},
})
})
It("grants the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
if err != nil {
panic("should be able to get")
}
return createdDisruption.Status.State
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
})
})
})
})
})
})

0 comments on commit 1376a1d

Please sign in to comment.