Skip to content

Commit

Permalink
Add metrics to node disruption (#52)
Browse files Browse the repository at this point in the history
Add the following metrics:
- state: reflect the state of the node disruption
- created and deadline: report as value the creation timestamp and the deadline
- impacted nodes: report a metric for each impacted node, helpful for intersection

Unit testing of metrics in Go is not easy; it will be part of a future change
  • Loading branch information
geobeau authored Feb 21, 2024
1 parent 79b3985 commit e12ca3f
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ KINDCONFIG ?= $(shell pwd)/.kubecfg

## Tool Versions
KUSTOMIZE_VERSION ?= v5.0.1
CONTROLLER_TOOLS_VERSION ?= v0.12.0
CONTROLLER_TOOLS_VERSION ?= v0.14.0
KIND_VERSION ?= v0.20.0

.PHONY: kustomize
Expand Down
8 changes: 8 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (

nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
"github.com/criteo/node-disruption-controller/internal/controller"
"sigs.k8s.io/controller-runtime/pkg/metrics"
//+kubebuilder:scaffold:imports
)

Expand All @@ -47,6 +48,13 @@ func init() {

utilruntime.Must(nodedisruptionv1alpha1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme

metrics.Registry.MustRegister(
controller.NodeDisruptionState,
controller.NodeDisruptionCreated,
controller.NodeDisruptionDeadline,
controller.NodeDisruptionImpactedNodes,
)
}

func main() {
Expand Down
69 changes: 68 additions & 1 deletion internal/controller/nodedisruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
"github.com/criteo/node-disruption-controller/pkg/resolver"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand All @@ -46,6 +47,37 @@ type NodeDisruptionReconcilerConfig struct {
RejectOverlappingDisruption bool
}

// Prometheus gauges describing NodeDisruption resources. Every series carries a
// "node_disruption_name" label; UpdateNodeDisruptionMetric fills it with the
// name of the NodeDisruption and PruneNodeDisruptionMetric deletes by it.
var (
// NodeDisruptionState reports the state of a node disruption as a number.
// UpdateNodeDisruptionMetric sets 0 for Pending, 1 for Rejected and 2 for
// Granted (the Help text below calls that last state "accepted").
NodeDisruptionState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_disruption_state",
Help: "State of node disruption: pending=0, rejected=1, accepted=2",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionCreated reports the creation date of a node disruption.
// UpdateNodeDisruptionMetric sets it to CreationTimestamp.Unix().
NodeDisruptionCreated = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_disruption_created",
Help: "Date of create of the node disruption",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionDeadline reports the retry deadline of a node disruption.
// UpdateNodeDisruptionMetric sets it to Spec.Retry.Deadline.Unix(), or to 0
// when the deadline is the zero value.
NodeDisruptionDeadline = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_disruption_deadline",
Help: "Date of the deadline of the node disruption (0 if unset)",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionImpactedNodes has one series per (node disruption, node)
// pair, identified by the extra "node_name" label. UpdateNodeDisruptionMetric
// sets the value to 1 for every entry of Status.DisruptedNodes.
NodeDisruptionImpactedNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_disruption_impacted_node",
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
},
[]string{"node_disruption_name", "node_name"},
)
)

// NodeDisruptionReconciler reconciles NodeDisruptions
type NodeDisruptionReconciler struct {
client.Client
Expand All @@ -72,11 +104,14 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque

if err != nil {
if errors.IsNotFound(err) {
// If the resource was not found, nothing has to be done
PruneNodeDisruptionMetric(req.NamespacedName.Name)
// If the resource was not found, nothing has to be done
return clusterResult, nil
}
return clusterResult, err
}
logger.Info("Updating metrics")
UpdateNodeDisruptionMetric(nd)

logger.Info("Start reconcile of NodeDisruption", "state", nd.Status.State, "retryDate", nd.Status.NextRetryDate.Time)
if time.Now().Before(nd.Status.NextRetryDate.Time) {
Expand Down Expand Up @@ -105,6 +140,38 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
return clusterResult, nil
}

// PruneNodeDisruptionMetric remove metrics for a Node Disruption that don't exist anymore
func PruneNodeDisruptionMetric(nd_name string) {
NodeDisruptionState.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
}

// UpdateNodeDisruptionMetric publishes the gauges describing nd. All series
// are labelled with nd.Name:
//   - NodeDisruptionState: 0 for Pending, 1 for Rejected, 2 for Granted; any
//     other state is reported as 0.
//   - NodeDisruptionCreated: nd.CreationTimestamp.Unix().
//   - NodeDisruptionDeadline: nd.Spec.Retry.Deadline.Unix(), or 0 when the
//     deadline is unset.
//   - NodeDisruptionImpactedNodes: 1 for each node in nd.Status.DisruptedNodes.
//
// This function only sets values. Series for nodes that are no longer listed in
// nd.Status.DisruptedNodes are not removed here; they disappear only when
// PruneNodeDisruptionMetric is called for the node disruption.
func UpdateNodeDisruptionMetric(nd *nodedisruptionv1alpha1.NodeDisruption) {
	var state float64
	switch nd.Status.State {
	case nodedisruptionv1alpha1.Pending:
		state = 0
	case nodedisruptionv1alpha1.Rejected:
		state = 1
	case nodedisruptionv1alpha1.Granted:
		state = 2
	}
	NodeDisruptionState.WithLabelValues(nd.Name).Set(state)
	NodeDisruptionCreated.WithLabelValues(nd.Name).Set(float64(nd.CreationTimestamp.Unix()))

	// The deadline might not be set. Go's zero time is not the Unix epoch, so
	// converting an unset deadline with Unix() does not yield 0; report an
	// explicit 0 in that case to keep the exported value meaningful.
	var deadline int64
	if !nd.Spec.Retry.Deadline.IsZero() {
		deadline = nd.Spec.Retry.Deadline.Unix()
	}
	NodeDisruptionDeadline.WithLabelValues(nd.Name).Set(float64(deadline))

	for _, nodeName := range nd.Status.DisruptedNodes {
		NodeDisruptionImpactedNodes.WithLabelValues(nd.Name, nodeName).Set(1)
	}
}

// SetupWithManager sets up the controller with the Manager.
func (r *NodeDisruptionReconciler) SetupWithManager(mgr ctrl.Manager) error {
r.Recorder = mgr.GetEventRecorderFor("node-disruption-controller")
Expand Down

0 comments on commit e12ca3f

Please sign in to comment.