Skip to content

Commit

Permalink
Integrate Maintenance operator to schedule maintenance requests
Browse files Browse the repository at this point in the history
Signed-off-by: amaslennikov <amaslennikov@nvidia.com>
  • Loading branch information
almaslennikov committed Sep 21, 2024
1 parent ec4be7c commit 364c8bb
Show file tree
Hide file tree
Showing 11 changed files with 219 additions and 14 deletions.
6 changes: 4 additions & 2 deletions cmd/nic-configuration-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"flag"
"os"

maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
Expand All @@ -28,6 +29,7 @@ func main() {
ncolog.InitLog()

utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(maintenanceoperator.AddToScheme(scheme))
utilruntime.Must(v1alpha1.AddToScheme(scheme))

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Expand All @@ -53,7 +55,7 @@ func main() {

hostUtils := host.NewHostUtils()
hostManager := host.NewHostManager(nodeName, hostUtils)
maintenanceManager := maintenance.New()
maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace)

deviceDiscovery := controller.NewDeviceRegistry(mgr.GetClient(), hostManager, nodeName, namespace)
if err = mgr.Add(deviceDiscovery); err != nil {
Expand All @@ -69,7 +71,7 @@ func main() {
HostManager: hostManager,
MaintenanceManager: maintenanceManager,
}
err = nicDeviceReconciler.SetupWithManager(mgr)
err = nicDeviceReconciler.SetupWithManager(mgr, true)
if err != nil {
log.Log.Error(err, "unable to create controller", "controller", "NicDeviceReconciler")
os.Exit(1)
Expand Down
12 changes: 12 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,15 @@ rules:
- get
- patch
- update
- apiGroups:
- maintenance.nvidia.com
resources:
- nodemaintenances
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.22.0
toolchain go1.22.4

require (
github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4
github.com/Mellanox/rdmamap v1.1.0
github.com/jaypipes/ghw v0.12.0
github.com/jaypipes/pcidb v1.0.1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4 h1:XTyFEogTo9v/lZXMqKroHSpVimDxYOHvTdwScJHA7v0=
github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4/go.mod h1:5OIBO4beWexC3JvLIH1GGNzr49QW7UoZe2LgT/IXYIc=
github.com/Mellanox/rdmamap v1.1.0 h1:A/W1wAXw+6vm58f3VklrIylgV+eDJlPVIMaIKuxgUT4=
github.com/Mellanox/rdmamap v1.1.0/go.mod h1:fN+/V9lf10ABnDCwTaXRjeeWijLt2iVLETnK+sx/LY8=
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
Expand Down
1 change: 1 addition & 0 deletions internal/controller/nicconfigurationtemplate_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ type NicConfigurationTemplateReconciler struct {
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
//+kubebuilder:rbac:groups="",resources=pods,verbs=list
//+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;delete;get;list;patch;update;watch
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete

// Reconcile reconciles the NicConfigurationTemplate object
func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
Expand Down
28 changes: 25 additions & 3 deletions internal/controller/nicdevice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"sync"
"time"

maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
Expand Down Expand Up @@ -473,7 +474,7 @@ func (r *NicDeviceReconciler) updateDeviceStatusCondition(ctx context.Context, d
}

// SetupWithManager sets up the controller with the Manager.
func (r *NicDeviceReconciler) SetupWithManager(mgr ctrl.Manager) error {
func (r *NicDeviceReconciler) SetupWithManager(mgr ctrl.Manager, watchForMaintenance bool) error {
qHandler := func(q workqueue.TypedRateLimitingInterface[reconcile.Request]) {
q.Add(reconcile.Request{NamespacedName: k8sTypes.NamespacedName{
Namespace: "",
Expand Down Expand Up @@ -516,9 +517,30 @@ func (r *NicDeviceReconciler) SetupWithManager(mgr ctrl.Manager) error {
},
}

return ctrl.NewControllerManagedBy(mgr).
controller := ctrl.NewControllerManagedBy(mgr).
For(&v1alpha1.NicDevice{}).
Watches(&v1alpha1.NicDevice{}, eventHandler).
Watches(&v1alpha1.NicDevice{}, eventHandler)

if watchForMaintenance {
maintenanceEventHandler := handler.Funcs{
// We only want status update events
UpdateFunc: func(ctx context.Context, e event.UpdateEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) {
nm := e.ObjectNew.(*maintenanceoperator.NodeMaintenance)

if nm.Spec.RequestorID != consts.MaintenanceRequestor || nm.Spec.NodeName != r.NodeName {
// We want to skip event from maintenance not on the current node or not scheduled by us
return
}

log.Log.Info("Enqueuing sync for maintenance update event", "resource", e.ObjectNew.GetName())
qHandler(q)
},
}

controller.Watches(&maintenanceoperator.NodeMaintenance{}, maintenanceEventHandler)
}

return controller.
Named("nicDeviceReconciler").
Complete(r)
}
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/nicdevice_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ var _ = Describe("NicDeviceReconciler", func() {
HostManager: hostManager,
MaintenanceManager: maintenanceManager,
}
Expect(reconciler.SetupWithManager(mgr)).To(Succeed())
Expect(reconciler.SetupWithManager(mgr, false)).To(Succeed())
})

AfterEach(func() {
Expand Down
5 changes: 5 additions & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,9 @@ const (
SecondPortPrefix = "P2"

EnvBaremetal = "Baremetal"

MaintenanceRequestor = "configuration.nic.mellanox.com"
MaintenanceRequestName = "nic-configuration-operator-maintenance"

HostPath = "/host"
)
18 changes: 18 additions & 0 deletions pkg/host/mocks/HostUtils.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

43 changes: 43 additions & 0 deletions pkg/host/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"regexp"
"strconv"
"strings"
"syscall"

"github.com/Mellanox/rdmamap"
"github.com/jaypipes/ghw"
Expand Down Expand Up @@ -75,6 +76,8 @@ type HostUtils interface {
SetMaxReadRequestSize(pciAddr string, maxReadRequestSize int) error
// SetTrustAndPFC sets trust and PFC settings for a network interface
SetTrustAndPFC(interfaceName string, trust string, pfc string) error
// ScheduleReboot schedules reboot on the host
ScheduleReboot() error
}

type hostUtils struct {
Expand Down Expand Up @@ -562,6 +565,46 @@ func (h *hostUtils) SetTrustAndPFC(interfaceName string, trust string, pfc strin
return nil
}

// ScheduleReboot reboots the host by chrooting into consts.HostPath and
// running "shutdown -r now" through the exec interface.
//
// It returns an error when the current root cannot be opened, when the chroot
// fails, or when the shutdown command fails. Before returning, it attempts to
// leave the chroot again; failures of that cleanup are logged but not returned.
func (h *hostUtils) ScheduleReboot() error {
	log.Log.Info("HostUtils.ScheduleReboot()")

	// Keep a handle on the current root so that we can escape the chroot later.
	root, err := os.Open("/")
	if err != nil {
		log.Log.Error(err, "ScheduleReboot(): Failed to os.Open")
		return err
	}

	if err := syscall.Chroot(consts.HostPath); err != nil {
		log.Log.Error(err, "ScheduleReboot(): Failed to syscall.Chroot")
		// A Close failure is secondary: log it, but always report the chroot
		// error to the caller instead of shadowing it.
		if closeErr := root.Close(); closeErr != nil {
			log.Log.Error(closeErr, "ScheduleReboot(): Failed to os.Close")
		}
		return err
	}

	defer func() {
		// The handle must stay open until it has been used to change back to
		// the original root, so it is closed last.
		defer func() {
			if closeErr := root.Close(); closeErr != nil {
				log.Log.Error(closeErr, "ScheduleReboot(): Failed to os.Close")
			}
		}()

		if chdirErr := root.Chdir(); chdirErr != nil {
			log.Log.Error(chdirErr, "ScheduleReboot(): Failed to os.Chdir")
			return
		}
		if chrootErr := syscall.Chroot("."); chrootErr != nil {
			log.Log.Error(chrootErr, "ScheduleReboot(): Failed to syscall.Chroot")
		}
	}()

	cmd := h.execInterface.Command("shutdown", "-r", "now")
	if _, err := cmd.Output(); err != nil {
		log.Log.Error(err, "ScheduleReboot(): Failed to run shutdown -r now")
		return err
	}
	return nil
}

// NewHostUtils builds a HostUtils implementation that runs host commands
// through a freshly created exec interface.
func NewHostUtils() HostUtils {
	utils := &hostUtils{
		execInterface: execUtils.New(),
	}
	return utils
}
115 changes: 107 additions & 8 deletions pkg/maintenance/maintenancemanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@ package maintenance
import (
"context"

maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
"github.com/Mellanox/nic-configuration-operator/pkg/host"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/Mellanox/nic-configuration-operator/pkg/consts"
)

type MaintenanceManager interface {
Expand All @@ -28,32 +35,124 @@ type MaintenanceManager interface {
Reboot() error
}

type maintenanceManager struct{}
// maintenanceManager implements MaintenanceManager on top of NodeMaintenance
// objects and the host utilities.
type maintenanceManager struct {
	// client is used to list, create and delete NodeMaintenance objects.
	client client.Client
	// hostUtils performs the host reboot.
	hostUtils host.HostUtils
	// nodeName is the node whose maintenance requests this manager handles.
	nodeName string
	// namespace is where NodeMaintenance objects are listed and created.
	namespace string
}

// getNodeMaintenanceObject lists the NodeMaintenance objects in the manager's
// namespace and returns the first one whose requestor is
// consts.MaintenanceRequestor and whose node is the manager's node.
//
// It returns (nil, nil) when no such object exists, and a non-nil error when
// the list call fails.
func (m maintenanceManager) getNodeMaintenanceObject(ctx context.Context) (*maintenanceoperator.NodeMaintenance, error) {
	list := maintenanceoperator.NodeMaintenanceList{}
	if err := m.client.List(ctx, &list, client.InNamespace(m.namespace)); err != nil {
		log.Log.Error(err, "failed to get node maintenance objects")
		return nil, err
	}

	// Index into the slice instead of taking the address of the range
	// variable: no per-item copy, and no reliance on per-iteration loop
	// variable semantics.
	for i := range list.Items {
		obj := &list.Items[i]
		if obj.Spec.RequestorID == consts.MaintenanceRequestor && obj.Spec.NodeName == m.nodeName {
			return obj, nil
		}
	}

	return nil, nil
}

// ScheduleMaintenance creates a NodeMaintenance request for the manager's node
// unless one requested by this operator already exists.
//
// The request asks for the node to be cordoned and drained (force, delete
// emptyDir data). It returns an error when the existing request cannot be
// looked up or when the new request cannot be created.
func (m maintenanceManager) ScheduleMaintenance(ctx context.Context) error {
	log.Log.Info("maintenanceManager.ScheduleMaintenance()")

	scheduledMaintenance, err := m.getNodeMaintenanceObject(ctx)
	if err != nil {
		log.Log.Error(err, "failed to schedule node maintenance")
		return err
	}

	if scheduledMaintenance != nil {
		// Maintenance already scheduled by us, nothing to do
		return nil
	}

	maintenanceRequest := &maintenanceoperator.NodeMaintenance{
		ObjectMeta: metav1.ObjectMeta{
			// Every node creates its request in the same namespace, so the
			// name has to be unique per node; a constant name would make the
			// Create call collide as soon as a second node needs maintenance.
			Name:      consts.MaintenanceRequestName + "-" + m.nodeName,
			Namespace: m.namespace,
		},
		Spec: maintenanceoperator.NodeMaintenanceSpec{
			RequestorID: consts.MaintenanceRequestor,
			NodeName:    m.nodeName,
			Cordon:      true,
			DrainSpec: &maintenanceoperator.DrainSpec{
				Force:          true,
				DeleteEmptyDir: true,
			},
		},
	}

	if err := m.client.Create(ctx, maintenanceRequest); err != nil {
		log.Log.Error(err, "failed to schedule node maintenance")
		return err
	}

	return nil
}

// MaintenanceAllowed reports whether the NodeMaintenance request scheduled by
// this operator for the manager's node has its Ready condition set to true.
//
// It returns false with a nil error when no request exists, when the Ready
// condition is missing, or when it is not true. An error is returned only when
// the request lookup fails.
func (m maintenanceManager) MaintenanceAllowed(ctx context.Context) (bool, error) {
	log.Log.Info("maintenanceManager.MaintenanceAllowed()")

	request, err := m.getNodeMaintenanceObject(ctx)
	if err != nil {
		log.Log.Error(err, "failed to get node maintenance")
		return false, err
	}

	// We want to perform maintenance on NICs only when node is properly prepared
	if request == nil {
		return false, nil
	}

	ready := meta.FindStatusCondition(request.Status.Conditions, maintenanceoperator.ConditionTypeReady)
	switch {
	case ready == nil:
		log.Log.V(2).Info("couldn't retrieve maintenance condition, retry")
		return false, nil
	case ready.Status != metav1.ConditionTrue:
		log.Log.V(2).Info("maintenance is not ready yet", "reason", ready.Reason, "message", ready.Message)
		return false, nil
	default:
		return true, nil
	}
}

// ReleaseMaintenance deletes the NodeMaintenance request scheduled by this
// operator for the manager's node, if there is one.
//
// It is a no-op when no request exists. It returns an error when the lookup or
// the delete call fails.
func (m maintenanceManager) ReleaseMaintenance(ctx context.Context) error {
	log.Log.Info("maintenanceManager.ReleaseMaintenance()")

	request, err := m.getNodeMaintenanceObject(ctx)
	if err != nil {
		log.Log.Error(err, "failed to get node maintenance")
		return err
	}

	// Nothing was scheduled, so there is nothing to release.
	if request == nil {
		return nil
	}

	if deleteErr := m.client.Delete(ctx, request); deleteErr != nil {
		log.Log.Error(deleteErr, "failed to release node maintenance")
		return deleteErr
	}
	return nil
}

// Reboot asks the host utilities to reboot the node and returns whatever
// error ScheduleReboot reports.
func (m maintenanceManager) Reboot() error {
	log.Log.Info("maintenanceManager.Reboot()")

	return m.hostUtils.ScheduleReboot()
}

func New() MaintenanceManager {
return maintenanceManager{}
// New returns a MaintenanceManager that manages NodeMaintenance requests for
// nodeName in namespace through the given client, and reboots the host through
// hostUtils.
func New(client client.Client, hostUtils host.HostUtils, nodeName string, namespace string) MaintenanceManager {
	manager := maintenanceManager{
		client:    client,
		hostUtils: hostUtils,
		nodeName:  nodeName,
		namespace: namespace,
	}
	return manager
}

0 comments on commit 364c8bb

Please sign in to comment.