diff --git a/Makefile b/Makefile index 5f10d34..e3b490e 100644 --- a/Makefile +++ b/Makefile @@ -284,7 +284,8 @@ clean: ## clean files .PHONY: manifests manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./api/..." output:crd:artifacts:config=config/crd/bases + $(CONTROLLER_GEN) crd paths="./api/..." output:crd:artifacts:config=config/crd/bases + $(CONTROLLER_GEN) rbac:roleName=manager-role webhook paths="./internal/controller/..." cp -f config/crd/bases/* deployment/maintenance-operator-chart/crds .PHONY: generate @@ -292,7 +293,7 @@ generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./api/..." .PHONY: test -test: unit-test lint +test: lint unit-test .PHONY: unit-test unit-test: envtest ## Run unit tests. diff --git a/api/go.mod b/api/go.mod index 4469676..79d41a3 100644 --- a/api/go.mod +++ b/api/go.mod @@ -5,9 +5,9 @@ go 1.22.0 toolchain go1.22.10 require ( - k8s.io/api v0.31.0 - k8s.io/apimachinery v0.31.0 - sigs.k8s.io/controller-runtime v0.19.0 + k8s.io/api v0.31.4 + k8s.io/apimachinery v0.31.4 + sigs.k8s.io/controller-runtime v0.19.4 ) require ( diff --git a/api/go.sum b/api/go.sum index 8b1e67c..70064d1 100644 --- a/api/go.sum +++ b/api/go.sum @@ -93,16 +93,16 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.31.0 h1:b9LiSjR2ym/SzTOlfMHm1tr7/21aD7fSkqgD/CVJBCo= -k8s.io/api v0.31.0/go.mod h1:0YiFF+JfFxMM6+1hQei8FY8M7s1Mth+z/q7eF1aJkTE= -k8s.io/apimachinery v0.31.0 h1:m9jOiSr3FoSSL5WO9bjm1n6B9KROYYgNZOb4tyZ1lBc= -k8s.io/apimachinery v0.31.0/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= +k8s.io/api v0.31.4 h1:I2QNzitPVsPeLQvexMEsj945QumYraqv9m74isPDKhM= +k8s.io/api v0.31.4/go.mod h1:d+7vgXLvmcdT1BCo79VEgJxHHryww3V5np2OYTr6jdw= +k8s.io/apimachinery v0.31.4 h1:8xjE2C4CzhYVm9DGf60yohpNUh5AEBnPxCryPBECmlM= +k8s.io/apimachinery v0.31.4/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/controller-runtime v0.19.0 h1:nWVM7aq+Il2ABxwiCizrVDSlmDcshi9llbaFbC0ji/Q= -sigs.k8s.io/controller-runtime v0.19.0/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= +sigs.k8s.io/controller-runtime v0.19.4 h1:SUmheabttt0nx8uJtoII4oIP27BVVvAKFvdvGFwV/Qo= +sigs.k8s.io/controller-runtime v0.19.4/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= diff --git a/cmd/maintenance-manager/main.go b/cmd/maintenance-manager/main.go index 106eb6c..c2570ae 100644 --- a/cmd/maintenance-manager/main.go +++ b/cmd/maintenance-manager/main.go @@ -27,6 +27,8 @@ import ( "k8s.io/client-go/kubernetes" _ "k8s.io/client-go/plugin/pkg/client/auth" + ocpconfigv1 "github.com/openshift/api/config/v1" + mcv1 "github.com/openshift/api/machineconfiguration/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -40,6 +42,7 @@ import ( "github.com/Mellanox/maintenance-operator/internal/cordon" "github.com/Mellanox/maintenance-operator/internal/drain" operatorlog "github.com/Mellanox/maintenance-operator/internal/log" + "github.com/Mellanox/maintenance-operator/internal/openshift" "github.com/Mellanox/maintenance-operator/internal/podcompletion" "github.com/Mellanox/maintenance-operator/internal/scheduler" "github.com/Mellanox/maintenance-operator/internal/version" @@ -54,6 +57,8 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(ocpconfigv1.Install(scheme)) + utilruntime.Must(mcv1.Install(scheme)) utilruntime.Must(maintenancev1alpha1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme @@ -147,12 +152,24 @@ func main() { ctx := ctrl.SetupSignalHandler() mgrClient := mgr.GetClient() + ocpUtils, err := openshift.NewOpenshiftUtils(ctx, mgr.GetAPIReader()) + if err != nil { + setupLog.Error(err, "unable to create openshift utils") + os.Exit(1) + } + + if ocpUtils.IsOpenshift() { + setupLog.Info("openshift cluster detected", + "isOpenshift", ocpUtils.IsOpenshift(), "isHypershift", ocpUtils.IsHypershift()) + } + if err = (&controller.NodeMaintenanceReconciler{ Client: mgrClient, Scheme: mgr.GetScheme(), CordonHandler: cordon.NewCordonHandler(mgrClient, k8sInterface), WaitPodCompletionHandler: podcompletion.NewPodCompletionHandler(mgrClient), DrainManager: drain.NewManager(ctrl.Log.WithName("DrainManager"), ctx, k8sInterface), + MCPManager: openshift.NewMCPManager(ocpUtils, mgrClient), }).SetupWithManager(ctx, mgr, ctrl.Log.WithName("NodeMaintenanceReconciler")); err != nil { setupLog.Error(err, "unable to create controller", "controller", "NodeMaintenance") os.Exit(1) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 5860f13..1d1cdc0 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -53,34 +53,35 @@ rules: - list - watch - apiGroups: - - maintenance.nvidia.com + - config.openshift.io resources: - - maintenanceoperatorconfigs + - infrastructures verbs: - - create - - delete - get - list - - patch - - update - watch - apiGroups: - - maintenance.nvidia.com + - machineconfiguration.openshift.io resources: - - maintenanceoperatorconfigs/finalizers + - machineconfigpools verbs: + - get + - list + - patch - update + - watch - apiGroups: - - maintenance.nvidia.com + - machineconfiguration.openshift.io resources: - - maintenanceoperatorconfigs/status + - machineconfigs verbs: - get - - patch - - update + - list + - watch - apiGroups: - maintenance.nvidia.com resources: + - maintenanceoperatorconfigs - nodemaintenances verbs: - create @@ -93,12 +94,14 @@ rules: - apiGroups: - maintenance.nvidia.com resources: + - maintenanceoperatorconfigs/finalizers - nodemaintenances/finalizers verbs: - update - apiGroups: - maintenance.nvidia.com resources: + - maintenanceoperatorconfigs/status - nodemaintenances/status verbs: - get diff --git a/deployment/maintenance-operator-chart/templates/role.yaml b/deployment/maintenance-operator-chart/templates/role.yaml index 1e303bd..464e1a1 100644 --- a/deployment/maintenance-operator-chart/templates/role.yaml +++ b/deployment/maintenance-operator-chart/templates/role.yaml @@ -1,3 +1,4 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -5,22 +6,6 @@ metadata: labels: {{- include "maintenance-operator.labels" . | nindent 4 }} rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create -- nonResourceURLs: - - /metrics - verbs: - - get - apiGroups: - "" resources: @@ -62,34 +47,43 @@ rules: - patch - update - apiGroups: - - maintenance.nvidia.com + - apps resources: - - maintenanceoperatorconfigs + - daemonsets verbs: - - create - - delete - get - list - - patch - - update - watch - apiGroups: - - maintenance.nvidia.com + - config.openshift.io resources: - - maintenanceoperatorconfigs/finalizers + - infrastructures verbs: - - update + - get + - list + - watch - apiGroups: - - maintenance.nvidia.com + - machineconfiguration.openshift.io resources: - - maintenanceoperatorconfigs/status + - machineconfigpools verbs: - get + - list - patch - update + - watch +- apiGroups: + - machineconfiguration.openshift.io + resources: + - machineconfigs + verbs: + - get + - list + - watch - apiGroups: - maintenance.nvidia.com resources: + - maintenanceoperatorconfigs - nodemaintenances verbs: - create @@ -102,26 +96,19 @@ rules: - apiGroups: - maintenance.nvidia.com resources: + - maintenanceoperatorconfigs/finalizers - nodemaintenances/finalizers verbs: - update - apiGroups: - maintenance.nvidia.com resources: + - maintenanceoperatorconfigs/status - nodemaintenances/status verbs: - get - patch - update -- apiGroups: - - apps - resources: - - daemonsets - verbs: - - get - - list - - watch - --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role diff --git a/go.mod b/go.mod index 7c123f5..e81b421 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/go-logr/logr v1.4.2 github.com/onsi/ginkgo/v2 v2.22.2 github.com/onsi/gomega v1.36.2 + github.com/openshift/api v0.0.0-20250102185430-d6d8306a24ec github.com/pkg/errors v0.9.1 github.com/stretchr/testify v1.10.0 go.uber.org/zap v1.27.0 @@ -32,7 +33,6 @@ require ( github.com/chai2010/gettext-go v1.0.2 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect - github.com/evanphx/json-patch v4.12.0+incompatible // indirect github.com/evanphx/json-patch/v5 v5.9.0 // indirect github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect @@ -54,8 +54,8 @@ require ( github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/websocket v1.5.0 // indirect - github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect - github.com/imdario/mergo v0.3.6 // indirect + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect + github.com/imdario/mergo v0.3.13 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -77,7 +77,7 @@ require ( github.com/prometheus/procfs v0.15.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/spf13/cobra v1.8.1 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/pflag v1.0.6-0.20210604193023-d5e0c0615ace // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/x448/float16 v0.8.4 // indirect github.com/xlab/treeprint v1.2.0 // indirect diff --git a/go.sum b/go.sum index 38c0bf8..43d2075 100644 --- a/go.sum +++ b/go.sum @@ -31,8 +31,8 @@ github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxER github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84= -github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d h1:105gxyaGwCFad8crR9dcMQWvV9Hvulu6hwUh4tWPJnM= @@ -96,10 +96,10 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 h1:pdN6V1QBWetyv/0+wjACpqVH+eVULgEjkurDLq3goeM= -github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= -github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28= -github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= +github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -140,6 +140,8 @@ github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk= github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= +github.com/openshift/api v0.0.0-20250102185430-d6d8306a24ec h1:VEDRGJmiYeN0V0xW1aI9wfzEMgaMZOVasy3FzEz27Lo= +github.com/openshift/api v0.0.0-20250102185430-d6d8306a24ec/go.mod h1:Shkl4HanLwDiiBzakv+con/aMGnVE2MAGvoKp5oyYUo= github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -164,8 +166,9 @@ github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6-0.20210604193023-d5e0c0615ace h1:9PNP1jnUjRhfmGMlkXHjYPishpcw4jpSt/V/xYY3FMA= +github.com/spf13/pflag v1.0.6-0.20210604193023-d5e0c0615ace/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -286,6 +289,7 @@ gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/internal/controller/nodemaintenance_controller.go b/internal/controller/nodemaintenance_controller.go index f487629..f202626 100644 --- a/internal/controller/nodemaintenance_controller.go +++ b/internal/controller/nodemaintenance_controller.go @@ -47,6 +47,7 @@ import ( "github.com/Mellanox/maintenance-operator/internal/drain" "github.com/Mellanox/maintenance-operator/internal/k8sutils" operatorlog "github.com/Mellanox/maintenance-operator/internal/log" + "github.com/Mellanox/maintenance-operator/internal/openshift" "github.com/Mellanox/maintenance-operator/internal/podcompletion" "github.com/Mellanox/maintenance-operator/internal/utils" ) @@ -55,6 +56,7 @@ var ( waitPodCompletionRequeueTime = 10 * time.Second drainReqeueTime = 10 * time.Second additionalRequestorsRequeueTime = 10 * time.Second + pauseMCPRequeueTime = 10 * time.Second ) const ( @@ -71,6 +73,7 @@ type NodeMaintenanceReconciler struct { CordonHandler cordon.Handler WaitPodCompletionHandler podcompletion.Handler DrainManager drain.Manager + MCPManager openshift.MCPManager } //+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete @@ -81,6 +84,9 @@ type NodeMaintenanceReconciler struct { //+kubebuilder:rbac:groups="",resources=pods,verbs=get;watch;list;update;patch;delete //+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;get;list;update;patch;delete //+kubebuilder:rbac:groups="apps",resources=daemonsets,verbs=get;watch;list +//+kubebuilder:rbac:groups="config.openshift.io",resources=infrastructures,verbs=get;watch;list +//+kubebuilder:rbac:groups="machineconfiguration.openshift.io",resources=machineconfigs,verbs=get;watch;list +//+kubebuilder:rbac:groups="machineconfiguration.openshift.io",resources=machineconfigpools,verbs=get;watch;list;update;patch // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. @@ -140,7 +146,7 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ reqLog.Error(err, "failed to handle uninitialized state for NodeMaintenance object") } case maintenancev1.ConditionReasonScheduled: - err = r.handleScheduledState(ctx, reqLog, nm) + res, err = r.handleScheduledState(ctx, reqLog, nm, node) if err != nil { reqLog.Error(err, "failed to handle scheduled state for NodeMaintenance object") } @@ -200,37 +206,52 @@ func (r *NodeMaintenanceReconciler) handleUninitiaizedState(ctx context.Context, // handleScheduledState handles NodeMaintenance in ConditionReasonScheduled state // it eventually sets NodeMaintenance Ready condition Reason to ConditionReasonWaitForPodCompletion -func (r *NodeMaintenanceReconciler) handleScheduledState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance) error { +func (r *NodeMaintenanceReconciler) handleScheduledState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) { reqLog.Info("Handle Scheduled NodeMaintenance") var err error + var res ctrl.Result if nm.GetDeletionTimestamp().IsZero() { // conditionally add finalizer err = k8sutils.AddFinalizer(ctx, r.Client, nm, maintenancev1.MaintenanceFinalizerName) if err != nil { reqLog.Error(err, "failed to set finalizer for NodeMaintenance") - return err + return res, err } } else { // object is being deleted, remove finalizer if exists and return reqLog.Info("NodeMaintenance object is deleting") - return r.handleFinalizerRemoval(ctx, reqLog, nm) + + err = r.MCPManager.UnpauseMCP(ctx, node, nm) + if err != nil { + return res, err + } + + return res, r.handleFinalizerRemoval(ctx, reqLog, nm) } - // TODO(adrianc): in openshift, we should pause MCP here + err = r.MCPManager.PauseMCP(ctx, node, nm) + if err != nil { + if errors.Is(err, openshift.ErrMachineConfigBusy) { + reqLog.Info("machine config pool is busy, requeue", "error", err) + return ctrl.Result{Requeue: true, RequeueAfter: pauseMCPRequeueTime}, nil + } + reqLog.Error(err, "failed to pause MachineConfigPool") + return res, fmt.Errorf("failed to pause MachineConfigPool. %w", err) + } // Set Ready condition to ConditionReasonCordon and update object err = k8sutils.SetReadyConditionReason(ctx, r.Client, nm, maintenancev1.ConditionReasonCordon) if err != nil { reqLog.Error(err, "failed to update status for NodeMaintenance object") - return err + return res, err } // emit state change event r.EventRecorder.Event( nm, corev1.EventTypeNormal, maintenancev1.ConditionChangedEventType, maintenancev1.ConditionReasonCordon) - return nil + return res, nil } func (r *NodeMaintenanceReconciler) handleCordonState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) error { @@ -248,7 +269,11 @@ func (r *NodeMaintenanceReconciler) handleCordonState(ctx context.Context, reqLo } } - // TODO(adrianc): unpause MCP in OCP when support is added. + err = r.MCPManager.UnpauseMCP(ctx, node, nm) + if err != nil { + return err + } + return r.handleFinalizerRemoval(ctx, reqLog, nm) } @@ -289,7 +314,11 @@ func (r *NodeMaintenanceReconciler) handleWaitPodCompletionState(ctx context.Con } } - // TODO(adrianc): unpause MCP in OCP when support is added. + err = r.MCPManager.UnpauseMCP(ctx, node, nm) + if err != nil { + return res, err + } + err = r.handleFinalizerRemoval(ctx, reqLog, nm) return res, err } @@ -331,29 +360,7 @@ func (r *NodeMaintenanceReconciler) handleDrainState(ctx context.Context, reqLog if !nm.GetDeletionTimestamp().IsZero() { // object is being deleted, handle cleanup. reqLog.Info("NodeMaintenance object is deleting") - - reqLog.Info("handle drain request removal") - drainReqUID := drain.DrainRequestUIDFromNodeMaintenance(nm) - req := r.DrainManager.GetRequest(drainReqUID) - if req != nil { - reqLog.Info("stopping and removing drain request", "reqUID", drainReqUID, "state", req.State()) - } - r.DrainManager.RemoveRequest(drainReqUID) - - if nm.Spec.Cordon { - reqLog.Info("handle uncordon of node, ", "node", node.Name) - err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node) - if err != nil { - return res, err - } - } - - // TODO(adrianc): unpause MCP in OCP when support is added. - - // remove finalizer if exists and return - reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer") - err = r.handleFinalizerRemoval(ctx, reqLog, nm) - return res, err + return r.handleDrainStateDelete(ctx, reqLog, nm, node) } if nm.Spec.DrainSpec != nil { @@ -429,6 +436,37 @@ func (r *NodeMaintenanceReconciler) handleDrainState(ctx context.Context, reqLog return res, nil } +func (r *NodeMaintenanceReconciler) handleDrainStateDelete(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) { + var res ctrl.Result + var err error + + reqLog.Info("handle drain request removal") + drainReqUID := drain.DrainRequestUIDFromNodeMaintenance(nm) + req := r.DrainManager.GetRequest(drainReqUID) + if req != nil { + reqLog.Info("stopping and removing drain request", "reqUID", drainReqUID, "state", req.State()) + } + r.DrainManager.RemoveRequest(drainReqUID) + + if nm.Spec.Cordon { + reqLog.Info("handle uncordon of node, ", "node", node.Name) + err = r.CordonHandler.HandleUnCordon(ctx, reqLog, nm, node) + if err != nil { + return res, err + } + } + + err = r.MCPManager.UnpauseMCP(ctx, node, nm) + if err != nil { + return res, err + } + + // remove finalizer if exists and return + reqLog.Info("NodeMaintenance object is deleting, removing maintenance finalizer") + err = r.handleFinalizerRemoval(ctx, reqLog, nm) + return res, err +} + // updateDrainStatus updates NodeMaintenance drain status in place. returns error if occurred func (r *NodeMaintenanceReconciler) updateDrainStatus(ctx context.Context, nm *maintenancev1.NodeMaintenance, drainReq drain.DrainRequest) error { ds, err := drainReq.Status() @@ -489,7 +527,10 @@ func (r *NodeMaintenanceReconciler) handleTerminalState(ctx context.Context, req } } - // TODO(adrianc): unpause MCP in OCP when support is added. + err = r.MCPManager.UnpauseMCP(ctx, node, nm) + if err != nil { + return res, err + } // remove finalizer if exists and return err = r.handleFinalizerRemoval(ctx, reqLog, nm) diff --git a/internal/controller/nodemaintenance_controller_test.go b/internal/controller/nodemaintenance_controller_test.go index b1160b2..20b05d0 100644 --- a/internal/controller/nodemaintenance_controller_test.go +++ b/internal/controller/nodemaintenance_controller_test.go @@ -38,6 +38,7 @@ import ( "github.com/Mellanox/maintenance-operator/internal/cordon" "github.com/Mellanox/maintenance-operator/internal/drain" "github.com/Mellanox/maintenance-operator/internal/k8sutils" + "github.com/Mellanox/maintenance-operator/internal/openshift" "github.com/Mellanox/maintenance-operator/internal/podcompletion" "github.com/Mellanox/maintenance-operator/internal/testutils" ) @@ -81,6 +82,7 @@ var _ = Describe("NodeMaintenance Controller", func() { WaitPodCompletionHandler: podcompletion.NewPodCompletionHandler(k8sClient), DrainManager: drain.NewManager(ctrllog.Log.WithName("DrainManager"), testCtx, k8sInterface), + MCPManager: openshift.NewNoOpMcpManager(), } // setup reconciler with manager diff --git a/internal/openshift/mcp.go b/internal/openshift/mcp.go new file mode 100644 index 0000000..9d94df1 --- /dev/null +++ b/internal/openshift/mcp.go @@ -0,0 +1,328 @@ +/* + Copyright 2025, NVIDIA CORPORATION & AFFILIATES + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package openshift + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + mcv1 "github.com/openshift/api/machineconfiguration/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "sigs.k8s.io/controller-runtime/pkg/client" + + maintenancev1 "github.com/Mellanox/maintenance-operator/api/v1alpha1" +) + +const ( + // InfraResourceName is the name of the openshift infrastructure resource + InfraResourceName = "cluster" + // DesiredMachineConfigAnnotationKey is used to specify the desired MachineConfig for a machine + DesiredMachineConfigAnnotationKey = "machineconfiguration.openshift.io/desiredConfig" + + // McpPausedAnnotKey is the annotations used to mark a MachineConfigPool as paused by the operator + McpPausedAnnotKey = "maintenance.nvidia.com/mcp-paused" + McpPausedAnnotValue = "true" + + // McpNameLabelKey is the label that contains the name of the MachineConfigPool which was paused. it is set on NodeMaintenance object + McpNameLabelKey = "maintenance.nvidia.com/paused-mcp-name" +) + +// ErrMachineConfigBusy is returned when MachineConfigPool is busy, either currently under configuration or is paused by another entity +// operation should be retried at a later time +var ErrMachineConfigBusy = errors.New("machineconfigpool busy") + +// MCPManager manages MachineConfigPool operations +type MCPManager interface { + // PauseMCP pauses the MachineConfigPool on the given node + PauseMCP(ctx context.Context, node *corev1.Node, nm *maintenancev1.NodeMaintenance) error + // UnpauseMCP unpauses the MachineConfigPool on the given node + UnpauseMCP(ctx context.Context, node *corev1.Node, nm *maintenancev1.NodeMaintenance) error +} + +// NewMCPManager returns a new MCPManager based on the cluster type +func NewMCPManager(ocpUtils OpenshiftUtils, client client.Client) MCPManager { + if ocpUtils.IsOpenshift() && !ocpUtils.IsHypershift() { + return NewOpenshiftMcpManager(client) + } + return NewNoOpMcpManager() +} + +// noOpMcpManager implements MCPManager with no-op methods +type noOpMcpManager struct{} + +func (n *noOpMcpManager) PauseMCP(ctx context.Context, node *corev1.Node, nm *maintenancev1.NodeMaintenance) error { + return nil +} + +func (n *noOpMcpManager) UnpauseMCP(ctx context.Context, node *corev1.Node, nm *maintenancev1.NodeMaintenance) error { + return nil +} + +// NewNoOpMcpManager returns a no-op MCPManager. used for non-openshift clusters +func NewNoOpMcpManager() MCPManager { + return &noOpMcpManager{} +} + +// openshiftMcpManager implements MCPManager for openshift clusters +type openshiftMcpManager struct { + client client.Client + mu sync.Mutex +} + +// NewOpenshiftMcpManager returns a new MCPManager. used for openshift clusters +func NewOpenshiftMcpManager(client client.Client) MCPManager { + return &openshiftMcpManager{client: client, mu: sync.Mutex{}} +} + +func (o *openshiftMcpManager) PauseMCP(ctx context.Context, node *corev1.Node, nm *maintenancev1.NodeMaintenance) error { + o.mu.Lock() + defer o.mu.Unlock() + + mcpName, err := o.getMCPName(ctx, node) + if err != nil { + return fmt.Errorf("failed to pause MachineConfigPool. failed to get pool for node %s: %w", node.Name, err) + } + + // set mcp name annotation on NodeMaintenance + if nm.Labels[McpNameLabelKey] != mcpName { + metav1.SetMetaDataLabel(&nm.ObjectMeta, McpNameLabelKey, mcpName) + err = o.client.Update(ctx, nm) + if err != nil { + return fmt.Errorf("failed to pause MachineConfigPool. failed to set MachineConfigPool name annotation on NodeMaintenance %s: %w", nm.Name, err) + } + + // wait for mcp name label of nodeMaintenance to be reflected back in cache sice we depend on this label for MCP unpause (of other NodeMaintenances) + err = waitForObjectUpdate(ctx, o.client, nm, func(obj client.Object) bool { + return obj.GetLabels()[McpNameLabelKey] == mcpName + }) + if err != nil { + return fmt.Errorf("failed to pause MachineConfigPool %s. failed while waiting for NodeMaintenance label to update: %w", mcpName, err) + } + } + + mcp := &mcv1.MachineConfigPool{} + err = o.client.Get(ctx, client.ObjectKey{Name: mcpName}, mcp) + if err != nil { + return fmt.Errorf("failed to pause MachineConfigPool. failed to get MachineConfigPool %s: %w", mcpName, err) + } + + // if already paused by the operator return + if mcp.Annotations[McpPausedAnnotKey] == McpPausedAnnotValue { + return nil + } + + // if paused, but not by the operator return with error ErrMachineConfigBusy + if mcp.Spec.Paused && !metav1.HasAnnotation(mcp.ObjectMeta, McpPausedAnnotKey) { + return fmt.Errorf("failed to pause MachineConfigPool. %s is already paused by another entity. %w", mcpName, ErrMachineConfigBusy) + } + + // check if machine config is doing something, if so return error ErrMachineConfigBusy + if mcp.Spec.Configuration.Name != mcp.Status.Configuration.Name { + return fmt.Errorf("failed to pause MachineConfigPool. %s is in the middle of a configuration change. %w", mcpName, ErrMachineConfigBusy) + } + + // pause the MachineConfigPool + err = o.changeMachineConfigPoolPause(ctx, mcp, true) + if err != nil { + return fmt.Errorf("failed to pause MachineConfigPool %s: %w", mcpName, err) + } + + // wait for pause annotation to be reflected back in cache + err = waitForObjectUpdate(ctx, o.client, mcp, func(obj client.Object) bool { + return obj.GetAnnotations()[McpPausedAnnotKey] == McpPausedAnnotValue + }) + if err != nil { + return fmt.Errorf("failed to pause MachineConfigPool %s. failed while waiting for annotation to update: %w", mcpName, err) + } + + // check if machine config is started doing something, if so, undo operation and return error + if mcp.Spec.Configuration.Name != mcp.Status.Configuration.Name { + err = o.changeMachineConfigPoolPause(ctx, mcp, false) + if err != nil { + return fmt.Errorf("failed to pause MachineConfigPool.%s is configuring, failed to unpause it: %w", mcpName, err) + } + // wait for pause annotation removal to be reflected back in cache + err = waitForObjectUpdate(ctx, o.client, mcp, func(obj client.Object) bool { + _, ok := obj.GetAnnotations()[McpPausedAnnotKey] + return !ok + }) + if err != nil { + return fmt.Errorf("failed to pause MachineConfigPool %s. failed while waiting for annotation to be removed: %w", mcpName, err) + } + + return fmt.Errorf("failed to pause MachineConfigPool. %s is in the middle of a configuration change. %w", mcpName, ErrMachineConfigBusy) + } + + return nil +} + +func (o *openshiftMcpManager) UnpauseMCP(ctx context.Context, node *corev1.Node, nm *maintenancev1.NodeMaintenance) error { + o.mu.Lock() + defer o.mu.Unlock() + + // get MCP + mcpName, err := o.getMCPName(ctx, node) + if err != nil { + return fmt.Errorf("failed to unpause MachineConfigPool. failed to get pool for node %s: %w", node.Name, err) + } + + mcp := &mcv1.MachineConfigPool{} + err = o.client.Get(ctx, client.ObjectKey{Name: mcpName}, mcp) + if err != nil { + return fmt.Errorf("failed to unpause MachineConfigPool. failed to get MachineConfigPool %s: %w", mcpName, err) + } + + // if mcp not paused by operator, return. + // this can happen if node changes MCP while under maintenance. + if !metav1.HasAnnotation(mcp.ObjectMeta, McpPausedAnnotKey) { + return nil + } + + // check if MCP is used by other NodeMaintenance + used, err := o.mcpUsedByOtherNodeMaintenance(ctx, mcp, nm) + if err != nil { + return fmt.Errorf("failed to unpause MachineConfigPool %s, failed to check if it is used by other nodeMaintenances: %w", mcpName, err) + } + + // if not used, we can unpause MCP + if !used { + err = o.changeMachineConfigPoolPause(ctx, mcp, false) + if err != nil { + return fmt.Errorf("failed to unpause MachineConfigPool %s: %w", mcpName, err) + } + + // wait for unpause annotation to be reflected back in cache + err = waitForObjectUpdate(ctx, o.client, mcp, func(obj client.Object) bool { + _, ok := obj.GetAnnotations()[McpPausedAnnotKey] + return !ok + }) + if err != nil { + return fmt.Errorf("failed to unpause MachineConfigPool %s. failed while waiting for annotation to be removed: %w", mcpName, err) + } + } + return nil +} + +// getMCPName returns the MachineConfigPool name for the given node +func (o *openshiftMcpManager) getMCPName(ctx context.Context, node *corev1.Node) (string, error) { + // To get the MachineConfigPool for the node, we do the following: + // 1. get the desired MachineConfig from the node annotation + // 2. find the owning MachineConfigPool for the MachineConfig we found in step 1 + desiredConfig, ok := node.Annotations[DesiredMachineConfigAnnotationKey] + if !ok { + return "", fmt.Errorf("failed to get MachineConfig, desired machine config annotation for node %s not found", node.Name) + } + + mc := &mcv1.MachineConfig{} + err := o.client.Get(ctx, client.ObjectKey{Name: desiredConfig}, mc) + if err != nil { + return "", fmt.Errorf("failed to get the the node's MachineConfig object with name %s. %w", desiredConfig, err) + } + for _, owner := range mc.OwnerReferences { + if owner.Kind == "MachineConfigPool" { + return owner.Name, nil + } + } + return "", fmt.Errorf("failed to find the MachineConfigPool of the node, no owner for the node's MachineConfig") +} + +// changeMachineConfigPoolPause pauses/unpauses the MachineConfigPool +func (o *openshiftMcpManager) changeMachineConfigPoolPause(ctx context.Context, mcp *mcv1.MachineConfigPool, pause bool) error { + // create merge patch to pause/unpause and annotate/un-annotate the machine config pool + var patch []byte + if pause { + patch = []byte(fmt.Sprintf(`{"spec":{"paused":true},"metadata":{"annotations":{"%s":"%s"}}}`, McpPausedAnnotKey, McpPausedAnnotValue)) + } else { + patch = []byte(fmt.Sprintf(`{"spec":{"paused": false},"metadata":{"annotations":{"%s": null}}}`, McpPausedAnnotKey)) + } + + err := o.client.Patch(ctx, mcp, client.RawPatch(types.MergePatchType, patch)) + if err != nil { + return fmt.Errorf("failed to patch MachineConfigPool %s to pause=%t: %w", mcp.Name, pause, err) + } + + return nil +} + +// mcpUsedByOtherNodeMaintenance return true if the MCP is used by other Nodes which have NodeMaintenance associated with them that already paused MCP +func (o *openshiftMcpManager) mcpUsedByOtherNodeMaintenance(ctx context.Context, mcp *mcv1.MachineConfigPool, nm *maintenancev1.NodeMaintenance) (bool, error) { + // get all nodes that match the pool + nodesInPool := &corev1.NodeList{} + selector, err := metav1.LabelSelectorAsSelector(mcp.Spec.NodeSelector) + if err != nil { + return false, err + } + + err = o.client.List(ctx, nodesInPool, &client.ListOptions{LabelSelector: selector}) + if err != nil { + return false, err + } + + nodesInPoolMap := make(map[string]bool) + for _, n := range nodesInPool.Items { + nodesInPoolMap[n.Name] = true + } + + nml := &maintenancev1.NodeMaintenanceList{} + err = o.client.List(ctx, nml) + if err != nil { + return false, err + } + + // check that no other NodeMaintenance is using the MCP + for _, nmItem := range nml.Items { + // if nodeMaintenance is not in the pool, skip + if !nodesInPoolMap[nmItem.Spec.NodeName] { + continue + } + // if MCP name label is not set on NodeMaintenance, skip (PauseMCP was not called for this NodeMaintenance) + if nmItem.Labels[McpNameLabelKey] == "" { + continue + } + + // if its the current nodeMaintenance, skip + if nmItem.Namespace == nm.Namespace && + nmItem.Name == nm.Name { + continue + } + + // mcp is used by other NodeMaintenance + return true, nil + } + + return false, nil +} + +// waitForObjectUpdate polls k8s using the provided client, gets the given object and checks if the object satisfies the checkFn +// checkFn should return true if the object satisfies the condition, false otherwise. +// this is useful when we need to wait for an object to be updated in the client cache +func waitForObjectUpdate(ctx context.Context, kclient client.Client, obj client.Object, checkFn func(o client.Object) bool) error { + return wait.PollUntilContextTimeout(ctx, 250*time.Millisecond, 10*time.Second, true, func(ctx context.Context) (bool, error) { + err := kclient.Get(ctx, client.ObjectKeyFromObject(obj), obj) + if err != nil { + return false, err + } + + return checkFn(obj), nil + }) +} diff --git a/internal/openshift/mcp_test.go b/internal/openshift/mcp_test.go new file mode 100644 index 0000000..0c74c1f --- /dev/null +++ b/internal/openshift/mcp_test.go @@ -0,0 +1,279 @@ +/* + Copyright 2025, NVIDIA CORPORATION & AFFILIATES + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package openshift_test + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + mcv1 "github.com/openshift/api/machineconfiguration/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + maintenancev1 "github.com/Mellanox/maintenance-operator/api/v1alpha1" + "github.com/Mellanox/maintenance-operator/internal/openshift" +) + +const ( + testNodeName = "test-node" + testMCPName = "test-mcp" + testMCName = "test-mc" + testNMame = "test-nm" +) + +var _ = Describe("mcp manager tests", func() { + Context("mcp no-op tests", func() { + It("should return correct values", func() { + testCtx := context.TODO() + noOpMcpManager := openshift.NewNoOpMcpManager() + Expect(noOpMcpManager.PauseMCP(testCtx, nil, nil)).To(Succeed()) + Expect(noOpMcpManager.UnpauseMCP(testCtx, nil, nil)).To(Succeed()) + }) + }) + + Context("mcp manager tests", func() { + var testClient client.Client + var testCtx context.Context + var mcpManager openshift.MCPManager + var node *corev1.Node + var nm *maintenancev1.NodeMaintenance + var mcp *mcv1.MachineConfigPool + var mc *mcv1.MachineConfig + + BeforeEach(func() { + testCtx = context.TODO() + s := runtime.NewScheme() + mcv1.Install(s) + corev1.AddToScheme(s) + maintenancev1.AddToScheme(s) + + node = getTestNode() + nm = getTestNM() + mcp = getTestMCP() + mc = getTestMC() + testClient = fake.NewClientBuilder(). + WithScheme(s). + WithObjects(node, mc, mcp, nm). + WithStatusSubresource(&mcv1.MachineConfigPool{}). + Build() + mcpManager = openshift.NewOpenshiftMcpManager(testClient) + }) + + Context("PauseMCP", func() { + It("Succeeds if not paused by the operator", func() { + Expect(mcpManager.PauseMCP(testCtx, node, nm)).To(Succeed()) + + // validate mcp + Expect(testClient.Get(testCtx, types.NamespacedName{Name: testMCPName}, mcp)).To(Succeed()) + Expect(mcp.Spec.Paused).To(BeTrue()) + Expect(mcp.Annotations[openshift.McpPausedAnnotKey]).To(Equal(openshift.McpPausedAnnotValue)) + + // validate node maintenance + Expect(testClient.Get(testCtx, types.NamespacedName{Name: testNMame}, nm)).To(Succeed()) + Expect(nm.Labels[openshift.McpNameLabelKey]).To(Equal(testMCPName)) + }) + + It("Succeed if already paused by the operator", func() { + Expect(mcpManager.PauseMCP(testCtx, node, nm)).To(Succeed()) + Expect(mcpManager.PauseMCP(testCtx, node, nm)).To(Succeed()) + }) + + It("Fails if paused by someone else", func() { + mcp.Spec.Paused = true + Expect(testClient.Update(testCtx, mcp)).To(Succeed()) + Expect(mcpManager.PauseMCP(testCtx, node, nm)).To(MatchError(openshift.ErrMachineConfigBusy)) + + // validate mcp + Expect(testClient.Get(testCtx, types.NamespacedName{Name: testMCPName}, mcp)).To(Succeed()) + Expect(mcp.Spec.Paused).To(BeTrue()) + Expect(mcp.Annotations).ToNot(HaveKey(openshift.McpPausedAnnotKey)) + + // validate node maintenance + Expect(testClient.Get(testCtx, types.NamespacedName{Name: testNMame}, nm)).To(Succeed()) + Expect(nm.Labels).To(HaveKey(openshift.McpNameLabelKey)) + }) + + It("Fails if MCP is under configuration", func() { + mcp.Status.Configuration.Name = "other-config" + Expect(testClient.Status().Update(testCtx, mcp)).To(Succeed()) + err := mcpManager.PauseMCP(testCtx, node, nm) + Expect(err).To(MatchError(openshift.ErrMachineConfigBusy)) + + // validate mcp + Expect(testClient.Get(testCtx, types.NamespacedName{Name: testMCPName}, mcp)).To(Succeed()) + Expect(mcp.Spec.Paused).To(BeFalse()) + Expect(mcp.Annotations).ToNot(HaveKey(openshift.McpPausedAnnotKey)) + + // validate node maintenance + Expect(testClient.Get(testCtx, types.NamespacedName{Name: testNMame}, nm)).To(Succeed()) + Expect(nm.Labels).To(HaveKey(openshift.McpNameLabelKey)) + }) + + It("Fails if MCP annotation on node is not present", func() { + node.Annotations = map[string]string{} + Expect(testClient.Update(testCtx, node)).To(Succeed()) + Expect(mcpManager.PauseMCP(testCtx, node, nm)).ToNot(Succeed()) + }) + + It("Fails if failed to get MCP name", func() { + mc.OwnerReferences = nil + Expect(testClient.Update(testCtx, mc)).To(Succeed()) + Expect(mcpManager.PauseMCP(testCtx, node, nm)).ToNot(Succeed()) + Expect(testClient.Delete(testCtx, mc)).To(Succeed()) + Expect(mcpManager.PauseMCP(testCtx, node, nm)).ToNot(Succeed()) + }) + + It("fails if failed to get MCP", func() { + Expect(testClient.Delete(testCtx, mcp)).To(Succeed()) + Expect(mcpManager.PauseMCP(testCtx, node, nm)).ToNot(Succeed()) + }) + }) + + Context("UnpauseMCP", func() { + It("Succeeds if paused by the operator", func() { + Expect(mcpManager.PauseMCP(testCtx, node, nm)).To(Succeed()) + Expect(testClient.Get(testCtx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed()) + Expect(mcp.Spec.Paused).To(BeTrue()) + Expect(mcpManager.UnpauseMCP(testCtx, node, nm)).To(Succeed()) + Expect(testClient.Get(testCtx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed()) + Expect(mcp.Spec.Paused).To(BeFalse()) + Expect(mcp.Annotations).ToNot(HaveKey(openshift.McpPausedAnnotKey)) + }) + + It("Succeeds if not paused by the operator, mcp remains paused", func() { + mcp.Spec.Paused = true + Expect(testClient.Update(testCtx, mcp)).To(Succeed()) + Expect(mcpManager.UnpauseMCP(testCtx, node, nm)).To(Succeed()) + + Expect(testClient.Get(testCtx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed()) + Expect(mcp.Spec.Paused).To(BeTrue()) + }) + + It("Succeeds - multiple nodeMaintenance same pool", func() { + // create another node that belongs to same pool and another node maintenance referenccing the new node + nm2 := getTestNM() + nm2.Name = "test-nm2" + nm2.Spec.NodeName = "test-node2" + Expect(testClient.Create(testCtx, nm2)).To(Succeed()) + + node2 := getTestNode() + node2.Name = "test-node2" + Expect(testClient.Create(testCtx, node2)).To(Succeed()) + + // call pause on both nodes/nm (pausing the same machine config pool) + Expect(mcpManager.PauseMCP(testCtx, node, nm)).To(Succeed()) + Expect(mcpManager.PauseMCP(testCtx, node2, nm2)).To(Succeed()) + + // unpause one of the nodes + Expect(mcpManager.UnpauseMCP(testCtx, node, nm)).To(Succeed()) + // should not affect MCP + Expect(testClient.Get(testCtx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed()) + Expect(mcp.Spec.Paused).To(BeTrue()) + Expect(mcp.Annotations[openshift.McpPausedAnnotKey]).To(Equal(openshift.McpPausedAnnotValue)) + + // delete the first node maintenance and call unpause with the other + Expect(testClient.Delete(testCtx, nm)).To(Succeed()) + Expect(mcpManager.UnpauseMCP(testCtx, node, nm2)).To(Succeed()) + // should unpause machine config pool as its the last node maintenance referencing it + Expect(testClient.Get(testCtx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed()) + Expect(mcp.Spec.Paused).To(BeFalse()) + Expect(mcp.Annotations).ToNot(HaveKey(openshift.McpPausedAnnotKey)) + }) + + It("fails if failed to get MCP name", func() { + mc.OwnerReferences = nil + Expect(testClient.Update(testCtx, mc)).To(Succeed()) + Expect(mcpManager.UnpauseMCP(testCtx, node, nm)).ToNot(Succeed()) + Expect(testClient.Delete(testCtx, mc)).To(Succeed()) + Expect(mcpManager.UnpauseMCP(testCtx, node, nm)).ToNot(Succeed()) + }) + + It("fails if failed to get MCP", func() { + Expect(testClient.Delete(testCtx, mcp)).ToNot(HaveOccurred()) + Expect(mcpManager.UnpauseMCP(testCtx, node, nm)).ToNot(Succeed()) + }) + }) + }) +}) + +func getTestNode() *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNodeName, + Annotations: map[string]string{ + openshift.DesiredMachineConfigAnnotationKey: testMCName, + }, + Labels: map[string]string{ + "mcp-pool": testMCName, + }, + }, + } +} + +func getTestMCP() *mcv1.MachineConfigPool { + return &mcv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: testMCPName, + Annotations: map[string]string{ + "foo": "bar", + }, + }, + Spec: mcv1.MachineConfigPoolSpec{ + Paused: false, + Configuration: mcv1.MachineConfigPoolStatusConfiguration{ + ObjectReference: corev1.ObjectReference{Name: testMCName}, + }, + NodeSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"mcp-pool": testMCName}, + }, + }, + Status: mcv1.MachineConfigPoolStatus{ + Configuration: mcv1.MachineConfigPoolStatusConfiguration{ + ObjectReference: corev1.ObjectReference{Name: testMCName}, + }, + }, + } +} + +func getTestMC() *mcv1.MachineConfig { + return &mcv1.MachineConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: testMCName, + OwnerReferences: []metav1.OwnerReference{ + {Kind: "MachineConfigPool", Name: testMCPName}, + }, + }, + Spec: mcv1.MachineConfigSpec{}, + } +} + +func getTestNM() *maintenancev1.NodeMaintenance { + return &maintenancev1.NodeMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNMame, + }, + Spec: maintenancev1.NodeMaintenanceSpec{ + RequestorID: "foo.bar.baz", + NodeName: testNodeName, + }, + } +} diff --git a/internal/openshift/openshift_suite_test.go b/internal/openshift/openshift_suite_test.go new file mode 100644 index 0000000..647a360 --- /dev/null +++ b/internal/openshift/openshift_suite_test.go @@ -0,0 +1,32 @@ +/* + Copyright 2025, NVIDIA CORPORATION & AFFILIATES + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package openshift_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestOpenshift(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "openshift test Suite") +} + +var _ = BeforeSuite(func() { +}) diff --git a/internal/openshift/utils.go b/internal/openshift/utils.go new file mode 100644 index 0000000..87d494e --- /dev/null +++ b/internal/openshift/utils.go @@ -0,0 +1,71 @@ +/* + Copyright 2025, NVIDIA CORPORATION & AFFILIATES + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package openshift + +import ( + "context" + "fmt" + + ocpconfigv1 "github.com/openshift/api/config/v1" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// OpenshiftUtils provides utility functions for openshift +type OpenshiftUtils interface { + // IsOpenshift returns true if the cluster is openshift + IsOpenshift() bool + // IsHypershift returns true if the cluster is hypershift (openshift with extenal control plane nodes) + IsHypershift() bool +} + +// openshiftUtils implements OpenshiftUtils +type openshiftUtils struct { + isOpenshift bool + isHypershift bool +} + +// NewOpenshiftUtils returns a new OpenshiftUtils +func NewOpenshiftUtils(ctx context.Context, client client.Reader) (OpenshiftUtils, error) { + ocpUtils := &openshiftUtils{} + + // check if the cluster is openshift by getting the Infrastructure resource + infra := &ocpconfigv1.Infrastructure{} + err := client.Get(ctx, types.NamespacedName{Name: InfraResourceName}, infra) + if err != nil { + if !meta.IsNoMatchError(err) { + return nil, fmt.Errorf("can't detect cluster type, cant get infrastructure resource (name: %s): %v", InfraResourceName, err) + } + // not an openshift cluster + return ocpUtils, nil + } + + ocpUtils.isOpenshift = true + // if control plane topology is external, it's hypershift. + ocpUtils.isHypershift = infra.Status.ControlPlaneTopology == ocpconfigv1.ExternalTopologyMode + + return ocpUtils, nil +} + +func (o *openshiftUtils) IsOpenshift() bool { + return o.isOpenshift +} + +func (o *openshiftUtils) IsHypershift() bool { + return o.isHypershift +} diff --git a/internal/openshift/utils_test.go b/internal/openshift/utils_test.go new file mode 100644 index 0000000..ab03e70 --- /dev/null +++ b/internal/openshift/utils_test.go @@ -0,0 +1,92 @@ +/* + Copyright 2025, NVIDIA CORPORATION & AFFILIATES + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package openshift_test + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + ocpconfigv1 "github.com/openshift/api/config/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + ctrfake "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/Mellanox/maintenance-operator/internal/openshift" +) + +var _ = Describe("openshift utils tests", func() { + var testCtx context.Context + var cFunc context.CancelFunc + var s *runtime.Scheme + + BeforeEach(func() { + testCtx, cFunc = context.WithCancel(context.TODO()) + s = runtime.NewScheme() + }) + + AfterEach(func() { + cFunc() + }) + + Context("OpenshiftUtils when cluster is openshift", func() { + It("should return correct values", func() { + ocpconfigv1.Install(s) + fakeClient := ctrfake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&ocpconfigv1.Infrastructure{}). + WithObjects(getInfrastructureResource(false)). + Build() + ocpUtils, err := openshift.NewOpenshiftUtils(testCtx, fakeClient) + Expect(err).ToNot(HaveOccurred()) + Expect(ocpUtils.IsOpenshift()).To(BeTrue()) + Expect(ocpUtils.IsHypershift()).To(BeFalse()) + }) + }) + + Context("OpenshiftUtils when cluster is hypershift", func() { + It("should return correct values", func() { + ocpconfigv1.Install(s) + fakeClient := ctrfake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&ocpconfigv1.Infrastructure{}). + WithObjects(getInfrastructureResource(true)). + Build() + ocpUtils, err := openshift.NewOpenshiftUtils(testCtx, fakeClient) + Expect(err).ToNot(HaveOccurred()) + Expect(ocpUtils.IsOpenshift()).To(BeTrue()) + Expect(ocpUtils.IsHypershift()).To(BeTrue()) + }) + }) +}) + +func getInfrastructureResource(isHypershift bool) *ocpconfigv1.Infrastructure { + cplaneTopology := ocpconfigv1.SingleReplicaTopologyMode + if isHypershift { + cplaneTopology = ocpconfigv1.ExternalTopologyMode + } + + return &ocpconfigv1.Infrastructure{ + ObjectMeta: metav1.ObjectMeta{ + Name: openshift.InfraResourceName, + }, + Spec: ocpconfigv1.InfrastructureSpec{}, + Status: ocpconfigv1.InfrastructureStatus{ + ControlPlaneTopology: cplaneTopology, + }, + } +}