diff --git a/.golangci.yml b/.golangci.yml index ca69a11..5936305 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -16,6 +16,10 @@ issues: linters: - dupl - lll + - path: "pkg/*" + linters: + - dupl + - lll linters: disable-all: true enable: diff --git a/Dockerfile.nic-configuration-daemon b/Dockerfile.nic-configuration-daemon index ab660cd..6f39080 100644 --- a/Dockerfile.nic-configuration-daemon +++ b/Dockerfile.nic-configuration-daemon @@ -23,7 +23,31 @@ COPY ./ ./ RUN --mount=type=cache,target=/go/pkg/mod/ GO_GCFLAGS=${GCFLAGS} make build-daemon FROM quay.io/centos/centos:stream9 -RUN yum -y install hwdata mstflint && yum clean all +ARG TARGETARCH +ENV RHEL_VERSION=9.4 +ENV OFED_PACKAGE_MAJOR_VERSION=24.07 +ENV OFED_PACKAGE_MINOR_VERSION=0.6.1.0 +ENV MFT_VERSION=4.29.0-131 +ENV MLNX_TOOLS_VERSION=0.2407061 + +RUN yum -y install hwdata mstflint wget pciutils procps-ng kmod systemd && yum clean all + +RUN ARCH_SUFFIX="${TARGETARCH}" \ + && ARCH_SUFFIX="${ARCH_SUFFIX//amd64/x86_64}" \ + && ARCH_SUFFIX="${ARCH_SUFFIX//arm64/aarch64}" \ + && wget https://linux.mellanox.com/public/repo/mlnx_ofed/${OFED_PACKAGE_MAJOR_VERSION}-${OFED_PACKAGE_MINOR_VERSION}/rhel${RHEL_VERSION}/${ARCH_SUFFIX}/mlnx-tools-${OFED_PACKAGE_MAJOR_VERSION}-${MLNX_TOOLS_VERSION}.${ARCH_SUFFIX}.rpm \ + && rpm -i mlnx-tools-${OFED_PACKAGE_MAJOR_VERSION}-${MLNX_TOOLS_VERSION}.${ARCH_SUFFIX}.rpm \ + && rm mlnx-tools-${OFED_PACKAGE_MAJOR_VERSION}-${MLNX_TOOLS_VERSION}.${ARCH_SUFFIX}.rpm + +RUN ARCH_SUFFIX1="${TARGETARCH}" \ + && ARCH_SUFFIX1="${ARCH_SUFFIX1//amd64/x86_64}" \ + && ARCH_SUFFIX1="${ARCH_SUFFIX1//arm64/aarch64}" \ + && ARCH_SUFFIX2="${TARGETARCH}" \ + && ARCH_SUFFIX2="${ARCH_SUFFIX2//amd64/x86_64}" \ + && wget https://linux.mellanox.com/public/repo/mlnx_ofed/${OFED_PACKAGE_MAJOR_VERSION}-${OFED_PACKAGE_MINOR_VERSION}/rhel${RHEL_VERSION}/${ARCH_SUFFIX1}/mft-${MFT_VERSION}.${ARCH_SUFFIX2}.rpm \ + && rpm -i mft-${MFT_VERSION}.${ARCH_SUFFIX2}.rpm \ + && rm mft-${MFT_VERSION}.${ARCH_SUFFIX2}.rpm + WORKDIR / COPY --from=builder /workspace/build/nic-configuration-daemon . diff --git a/api/v1alpha1/nicconfigurationtemplate_types.go b/api/v1alpha1/nicconfigurationtemplate_types.go index 7530cd1..1fe5783 100644 --- a/api/v1alpha1/nicconfigurationtemplate_types.go +++ b/api/v1alpha1/nicconfigurationtemplate_types.go @@ -34,7 +34,7 @@ type NicSelectorSpec struct { // +enum type LinkTypeEnum string -// PciPerformanceOptimizedSpec specifies PCI performace optimization settings +// PciPerformanceOptimizedSpec specifies PCI performance optimization settings type PciPerformanceOptimizedSpec struct { // Specifies whether to enable PCI performance optimization Enabled bool `json:"enabled"` diff --git a/cmd/nic-configuration-daemon/main.go b/cmd/nic-configuration-daemon/main.go index 3bfdf6b..c6d426f 100644 --- a/cmd/nic-configuration-daemon/main.go +++ b/cmd/nic-configuration-daemon/main.go @@ -4,15 +4,18 @@ import ( "flag" "os" - "github.com/Mellanox/nic-configuration-operator/internal/controller" + maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" + "github.com/Mellanox/nic-configuration-operator/internal/controller" "github.com/Mellanox/nic-configuration-operator/pkg/host" + "github.com/Mellanox/nic-configuration-operator/pkg/maintenance" "github.com/Mellanox/nic-configuration-operator/pkg/ncolog" ) @@ -25,17 +28,9 @@ func main() { flag.Parse() ncolog.InitLog() - err := clientgoscheme.AddToScheme(scheme) - if err != nil { - log.Log.Error(err, "failed to load client-go to scheme") - os.Exit(1) - } - - err = v1alpha1.AddToScheme(scheme) - if err != nil { - log.Log.Error(err, "failed to load nic configuration CRDs to scheme") - os.Exit(1) - } + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(maintenanceoperator.AddToScheme(scheme)) + utilruntime.Must(v1alpha1.AddToScheme(scheme)) mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, @@ -59,7 +54,8 @@ func main() { } hostUtils := host.NewHostUtils() - hostManager := host.NewHostManager(hostUtils) + hostManager := host.NewHostManager(nodeName, hostUtils) + maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace) deviceDiscovery := controller.NewDeviceRegistry(mgr.GetClient(), hostManager, nodeName, namespace) if err = mgr.Add(deviceDiscovery); err != nil { @@ -67,6 +63,20 @@ func main() { os.Exit(1) } + nicDeviceReconciler := controller.NicDeviceReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + NodeName: nodeName, + NamespaceName: namespace, + HostManager: hostManager, + MaintenanceManager: maintenanceManager, + } + err = nicDeviceReconciler.SetupWithManager(mgr, true) + if err != nil { + log.Log.Error(err, "unable to create controller", "controller", "NicDeviceReconciler") + os.Exit(1) + } + ctx := ctrl.SetupSignalHandler() err = mgr.GetCache().IndexField(ctx, &v1alpha1.NicDevice{}, "status.node", func(o client.Object) []string { diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index dc4fcbe..d0d4f18 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -84,3 +84,15 @@ rules: - get - patch - update +- apiGroups: + - maintenance.nvidia.com + resources: + - nodemaintenances + verbs: + - create + - delete + - get + - list + - patch + - update + - watch diff --git a/docs/nic-configuration-reconcile-diagram.png b/docs/nic-configuration-reconcile-diagram.png new file mode 100755 index 0000000..35d9048 Binary files /dev/null and b/docs/nic-configuration-reconcile-diagram.png differ diff --git a/go.mod b/go.mod index 5ad7831..6cde7f8 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.22.0 toolchain go1.22.4 require ( + github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4 github.com/Mellanox/rdmamap v1.1.0 github.com/jaypipes/ghw v0.12.0 github.com/jaypipes/pcidb v1.0.1 diff --git a/go.sum b/go.sum index 912e5fe..85e4e6e 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4 h1:XTyFEogTo9v/lZXMqKroHSpVimDxYOHvTdwScJHA7v0= +github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4/go.mod h1:5OIBO4beWexC3JvLIH1GGNzr49QW7UoZe2LgT/IXYIc= github.com/Mellanox/rdmamap v1.1.0 h1:A/W1wAXw+6vm58f3VklrIylgV+eDJlPVIMaIKuxgUT4= github.com/Mellanox/rdmamap v1.1.0/go.mod h1:fN+/V9lf10ABnDCwTaXRjeeWijLt2iVLETnK+sx/LY8= github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA= diff --git a/internal/controller/devicediscovery_controller.go b/internal/controller/devicediscovery_controller.go index 0d7ce54..8a81915 100644 --- a/internal/controller/devicediscovery_controller.go +++ b/internal/controller/devicediscovery_controller.go @@ -82,6 +82,8 @@ func (d *DeviceDiscovery) reconcile(ctx context.Context) error { return err } + log.Log.V(2).Info("listed devices", "devices", list.Items) + node := &v1.Node{} err = d.Client.Get(ctx, types.NamespacedName{Name: d.nodeName}, node) if err != nil { @@ -93,6 +95,7 @@ func (d *DeviceDiscovery) reconcile(ctx context.Context) error { observedDeviceStatus, exists := observedDevices[nicDeviceCR.Status.SerialNumber] if !exists { + log.Log.V(2).Info("device doesn't exist on the node anymore, deleting", "device", nicDeviceCR.Name) // Need to delete this CR, it doesn't represent the observedDevice on host anymore err = d.Client.Delete(ctx, &nicDeviceCR) if err != nil { @@ -106,6 +109,7 @@ func (d *DeviceDiscovery) reconcile(ctx context.Context) error { observedDeviceStatus.Conditions = nicDeviceCR.Status.Conditions if !reflect.DeepEqual(nicDeviceCR.Status, observedDeviceStatus) { + log.Log.V(2).Info("device status changed, updating", "device", nicDeviceCR.Name, "crStatus", nicDeviceCR.Status, "observedStatus", observedDeviceStatus) // Status of the device changes, need to update the CR nicDeviceCR.Status = observedDeviceStatus diff --git a/internal/controller/nicconfigurationtemplate_controller.go b/internal/controller/nicconfigurationtemplate_controller.go index 0bd55d9..4e42d60 100644 --- a/internal/controller/nicconfigurationtemplate_controller.go +++ b/internal/controller/nicconfigurationtemplate_controller.go @@ -56,6 +56,7 @@ type NicConfigurationTemplateReconciler struct { //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch //+kubebuilder:rbac:groups="",resources=pods,verbs=list //+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;delete;get;list;patch;update;watch +//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete // Reconcile reconciles the NicConfigurationTemplate object func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -89,11 +90,13 @@ func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req nodeMap := map[string]*v1.Node{} for _, node := range nodeList.Items { + node := node nodeMap[node.Name] = &node } templates := []*v1alpha1.NicConfigurationTemplate{} for _, template := range templateList.Items { + template := template templates = append(templates, &template) } @@ -108,7 +111,7 @@ func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req for _, template := range templates { if !deviceMatchesSelectors(&device, template, node) { - r.dropDeviceFromStatus(ctx, device.Name, template) + r.dropDeviceFromStatus(device.Name, template) continue } @@ -129,7 +132,7 @@ func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req if len(matchingTemplates) > 1 { for _, template := range matchingTemplates { - r.dropDeviceFromStatus(ctx, device.Name, template) + r.dropDeviceFromStatus(device.Name, template) } templateNames := []string{} @@ -171,7 +174,7 @@ func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req return ctrl.Result{}, nil } -func (r *NicConfigurationTemplateReconciler) dropDeviceFromStatus(ctx context.Context, deviceName string, template *v1alpha1.NicConfigurationTemplate) { +func (r *NicConfigurationTemplateReconciler) dropDeviceFromStatus(deviceName string, template *v1alpha1.NicConfigurationTemplate) { index := slices.Index(template.Status.NicDevices, deviceName) if index != -1 { // Device no longer matches template, drop it from the template's status diff --git a/internal/controller/nicdevice_controller.go b/internal/controller/nicdevice_controller.go new file mode 100644 index 0000000..adee848 --- /dev/null +++ b/internal/controller/nicdevice_controller.go @@ -0,0 +1,586 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "sync" + "time" + + maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/runtime" + k8sTypes "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/json" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + v1alpha1 "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" + "github.com/Mellanox/nic-configuration-operator/pkg/consts" + "github.com/Mellanox/nic-configuration-operator/pkg/host" + "github.com/Mellanox/nic-configuration-operator/pkg/maintenance" + "github.com/Mellanox/nic-configuration-operator/pkg/types" +) + +const nicDeviceSyncEventName = "nic-device-sync-event-name" + +var requeueTime = 1 * time.Minute + +// NicDeviceReconciler reconciles a NicDevice object +type NicDeviceReconciler struct { + client.Client + Scheme *runtime.Scheme + + NodeName string + NamespaceName string + + HostManager host.HostManager + MaintenanceManager maintenance.MaintenanceManager +} + +type nicDeviceConfigurationStatuses []*nicDeviceConfigurationStatus + +type nicDeviceConfigurationStatus struct { + device *v1alpha1.NicDevice + nvConfigUpdateRequired bool + rebootRequired bool + lastStageError error +} + +// Reconcile reconciles the NicConfigurationTemplate object +func (r *NicDeviceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + configStatuses, err := r.getDevices(ctx) + if err != nil { + log.Log.Error(err, "failed to get devices to reconcile") + return ctrl.Result{}, err + } + + if len(configStatuses) == 0 { + err = r.MaintenanceManager.ReleaseMaintenance(ctx) + if err != nil { + log.Log.Error(err, "failed to release maintenance") + return ctrl.Result{}, err + } + // Nothing to reconcile + return ctrl.Result{}, nil + } + + err = r.handleSpecValidation(ctx, configStatuses) + if err != nil { + log.Log.Error(err, "failed to validate device's spec") + return ctrl.Result{}, err + } + + if configStatuses.nvConfigUpdateRequired() { + log.Log.V(2).Info("nv config update required, scheduling maintenance") + + result, err := r.ensureMaintenance(ctx) + if err != nil { + log.Log.V(2).Error(err, "failed to schedule maintenance") + return ctrl.Result{}, err + } + if result.Requeue || result.RequeueAfter != 0 { + return result, nil + } + + err = r.applyNvConfig(ctx, configStatuses) + if err != nil { + return ctrl.Result{}, err + } + } + + if configStatuses.rebootRequired() { + return r.handleReboot(ctx, configStatuses) + } + + if !configStatuses.nvConfigReadyForAll() { + return ctrl.Result{Requeue: true, RequeueAfter: requeueTime}, nil + } + + err = r.applyRuntimeConfig(ctx, configStatuses) + if err != nil { + return ctrl.Result{}, err + } + + if configStatuses.rebootRequired() { + return r.handleReboot(ctx, configStatuses) + } + + err = r.MaintenanceManager.ReleaseMaintenance(ctx) + if err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +func (r *NicDeviceReconciler) getDevices(ctx context.Context) (nicDeviceConfigurationStatuses, error) { + devices := &v1alpha1.NicDeviceList{} + + selectorFields := fields.OneTermEqualSelector("status.node", r.NodeName) + + err := r.Client.List(ctx, devices, &client.ListOptions{FieldSelector: selectorFields}) + if err != nil { + log.Log.Error(err, "failed to list NicDevice CRs") + return nil, err + } + + if len(devices.Items) == 0 { + err = r.MaintenanceManager.ReleaseMaintenance(ctx) + if err != nil { + log.Log.Error(err, "failed to release maintenance") + return nil, err + } + // Nothing to reconcile + return nil, nil + } + + configStatuses := nicDeviceConfigurationStatuses{} + + for i, device := range devices.Items { + if device.Spec.Configuration == nil { + continue + } + + configStatuses = append(configStatuses, &nicDeviceConfigurationStatus{ + device: &devices.Items[i], + }) + } + + return configStatuses, nil +} + +// ensureMaintenance schedules maintenance if required and requests reschedule if it's not ready yet +func (r *NicDeviceReconciler) ensureMaintenance(ctx context.Context) (ctrl.Result, error) { + err := r.MaintenanceManager.ScheduleMaintenance(ctx) + if err != nil { + log.Log.Error(err, "failed to schedule maintenance for node") + return ctrl.Result{}, err + } + + maintenanceAllowed, err := r.MaintenanceManager.MaintenanceAllowed(ctx) + if err != nil { + log.Log.Error(err, "failed to get maintenance status") + return ctrl.Result{}, err + } + if !maintenanceAllowed { + log.Log.V(2).Info("maintenance not allowed yet, exiting for now") + // Maintenance not yet allowed, waiting until then + return ctrl.Result{RequeueAfter: requeueTime}, nil + } + + return ctrl.Result{}, nil +} + +// applyRuntimeConfig applies each device's runtime spec in parallel +// if update is successful, applies status condition UpdateSuccessful, otherwise RuntimeConfigUpdateFailed +// if rebootRequired, sets status condition PendingReboot +// if status.rebootRequired == true, skips the device +// returns nil if all devices' config updates were successful, error otherwise +func (r *NicDeviceReconciler) applyRuntimeConfig(ctx context.Context, statuses nicDeviceConfigurationStatuses) error { + var wg sync.WaitGroup + + for i := 0; i < len(statuses); i++ { + wg.Add(1) + go func(index int) { + defer wg.Done() + + status := statuses[index] + status.lastStageError = nil + if status.rebootRequired { + return + } + + lastAppliedState, found := status.device.Annotations[consts.LastAppliedStateAnnotation] + if found { + specJson, err := json.Marshal(status.device.Spec) + if err != nil { + status.lastStageError = err + return + } + + if string(specJson) != lastAppliedState { + log.Log.V(2).Info("last applied state differs, reboot required", "device", status.device.Name) + status.rebootRequired = true + + err := r.updateDeviceStatusCondition(ctx, status.device, consts.PendingRebootReason, metav1.ConditionTrue, "") + if err != nil { + status.lastStageError = err + return + } + + return + } + } + + err := r.HostManager.ApplyDeviceRuntimeSpec(statuses[index].device) + if err != nil { + statuses[index].lastStageError = err + err = r.updateDeviceStatusCondition(ctx, status.device, consts.RuntimeConfigUpdateFailedReason, metav1.ConditionFalse, err.Error()) + if err != nil { + log.Log.Error(err, "failed to update device status condition", "device", status.device.Name) + } + return + } + + specJson, err := json.Marshal(status.device.Spec) + if err != nil { + status.lastStageError = err + return + } + + if status.device.Annotations == nil { + status.device.SetAnnotations(make(map[string]string)) + } + status.device.Annotations[consts.LastAppliedStateAnnotation] = string(specJson) + err = r.Update(ctx, status.device) + if err != nil { + status.lastStageError = err + return + } + + err = r.updateDeviceStatusCondition(ctx, status.device, consts.UpdateSuccessfulReason, metav1.ConditionFalse, "") + if err != nil { + status.lastStageError = err + return + } + }(i) + } + + wg.Wait() + + for _, status := range statuses { + if status.lastStageError != nil { + return status.lastStageError + } + } + + return nil +} + +// handleReboot schedules maintenance and reboots the node if maintenance is allowed +// Before rebooting the node, strips LastAppliedState annotations from all devices +// returns true if requeue of the reconcile request is required, false otherwise +// return err if encountered an error while performing maintenance scheduling / reboot +func (r *NicDeviceReconciler) handleReboot(ctx context.Context, statuses nicDeviceConfigurationStatuses) (ctrl.Result, error) { + err := r.MaintenanceManager.ScheduleMaintenance(ctx) + if err != nil { + log.Log.Error(err, "failed to schedule maintenance for node") + return ctrl.Result{}, err + } + + maintenanceAllowed, err := r.MaintenanceManager.MaintenanceAllowed(ctx) + if err != nil { + log.Log.Error(err, "failed to get maintenance status") + return ctrl.Result{}, err + } + if !maintenanceAllowed { + // Maintenance not yet allowed, waiting until then + return ctrl.Result{RequeueAfter: requeueTime}, nil + } + + // We need to strip last applied state annotation before reboot as it resets the runtime configuration + err = r.stripLastAppliedStateAnnotations(ctx, statuses) + if err != nil { + return ctrl.Result{}, err + } + + err = r.MaintenanceManager.Reboot() + if err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +// stripLastAppliedStateAnnotations deletes the consts.LastAppliedStateAnnotation from each device in parallel +// returns error if at least one annotation update failed +func (r *NicDeviceReconciler) stripLastAppliedStateAnnotations(ctx context.Context, statuses nicDeviceConfigurationStatuses) error { + var wg sync.WaitGroup + + for i := 0; i < len(statuses); i++ { + wg.Add(1) + go func(index int) { + defer wg.Done() + + status := statuses[index] + status.lastStageError = nil + + annotations := status.device.GetAnnotations() + if _, found := annotations[consts.LastAppliedStateAnnotation]; !found { + return + } + + delete(annotations, consts.LastAppliedStateAnnotation) + status.device.SetAnnotations(annotations) + status.lastStageError = r.Update(ctx, status.device) + }(i) + } + + wg.Wait() + + for _, status := range statuses { + if status.lastStageError != nil { + return status.lastStageError + } + } + + return nil +} + +// applyNvConfig applies each device's non-volatile spec in parallel +// if update is correct, applies status condition PendingReboot, otherwise NonVolatileConfigUpdateFailed +// sets rebootRequired flags for each device's configuration status +// if status.nvConfigUpdateRequired == false, skips the device +// returns nil if all devices' config updates were successful, error otherwise +func (r *NicDeviceReconciler) applyNvConfig(ctx context.Context, statuses nicDeviceConfigurationStatuses) error { + var wg sync.WaitGroup + + for i := 0; i < len(statuses); i++ { + wg.Add(1) + go func(index int) { + defer wg.Done() + + status := statuses[index] + status.lastStageError = nil + if !status.nvConfigUpdateRequired { + return + } + + rebootRequired, err := r.HostManager.ApplyDeviceNvSpec(ctx, statuses[index].device) + if err != nil { + statuses[index].lastStageError = err + if types.IsIncorrectSpecError(err) { + err = r.updateDeviceStatusCondition(ctx, status.device, consts.IncorrectSpecReason, metav1.ConditionFalse, err.Error()) + if err != nil { + log.Log.Error(err, "failed to update device status condition", "device", status.device.Name) + } + } else { + err = r.updateDeviceStatusCondition(ctx, status.device, consts.NonVolatileConfigUpdateFailedReason, metav1.ConditionFalse, err.Error()) + if err != nil { + log.Log.Error(err, "failed to update device status condition", "device", status.device.Name) + } + } + } + err = r.updateDeviceStatusCondition(ctx, status.device, consts.PendingRebootReason, metav1.ConditionTrue, "") + if err != nil { + status.lastStageError = err + } + + statuses[index].rebootRequired = rebootRequired + }(i) + } + + wg.Wait() + + for _, status := range statuses { + if status.lastStageError != nil { + return status.lastStageError + } + } + + return nil +} + +// handleSpecValidation validates each device's spec in parallel +// if spec is correct, applies status condition UpdateStarted, otherwise IncorrectSpec +// sets nvConfigUpdateRequired and rebootRequired flags for each device's configuration status +// returns nil if all devices' specs are correct, error otherwise +func (r *NicDeviceReconciler) handleSpecValidation(ctx context.Context, statuses nicDeviceConfigurationStatuses) error { + var wg sync.WaitGroup + + for i := 0; i < len(statuses); i++ { + wg.Add(1) + go func(index int) { + defer wg.Done() + status := statuses[index] + + nvConfigUpdateRequired, rebootRequired, err := r.HostManager.ValidateDeviceNvSpec(ctx, status.device) + log.Log.V(2).Info("nv spec validation complete for device", "device", status.device.Name, "nvConfigUpdateRequired", nvConfigUpdateRequired, "rebootRequired", rebootRequired) + if err != nil { + log.Log.Error(err, "failed to validate spec for device", "device", status.device.Name) + status.lastStageError = err + if types.IsIncorrectSpecError(err) { + err = r.updateDeviceStatusCondition(ctx, status.device, consts.IncorrectSpecReason, metav1.ConditionFalse, err.Error()) + if err != nil { + log.Log.Error(err, "failed to update device status condition", "device", status.device.Name) + } + } else { + err = r.updateDeviceStatusCondition(ctx, status.device, consts.SpecValidationFailed, metav1.ConditionFalse, err.Error()) + if err != nil { + log.Log.Error(err, "failed to update device status condition", "device", status.device.Name) + } + } + } + if nvConfigUpdateRequired { + log.Log.V(2).Info("update started for device", "device", status.device.Name) + err = r.updateDeviceStatusCondition(ctx, status.device, consts.UpdateStartedReason, metav1.ConditionTrue, "") + if err != nil { + status.lastStageError = err + } + } + + status.nvConfigUpdateRequired = nvConfigUpdateRequired + status.rebootRequired = rebootRequired + }(i) + } + + wg.Wait() + + for _, status := range statuses { + if status.lastStageError != nil { + return status.lastStageError + } + } + + return nil +} + +func (r *NicDeviceReconciler) updateDeviceStatusCondition(ctx context.Context, device *v1alpha1.NicDevice, reason string, status metav1.ConditionStatus, message string) error { + cond := metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: status, + ObservedGeneration: device.Generation, + Reason: reason, + Message: message, + } + changed := meta.SetStatusCondition(&device.Status.Conditions, cond) + var err error + if changed { + err = r.Client.Status().Update(ctx, device) + } + + return err +} + +// SetupWithManager sets up the controller with the Manager. +func (r *NicDeviceReconciler) SetupWithManager(mgr ctrl.Manager, watchForMaintenance bool) error { + qHandler := func(q workqueue.TypedRateLimitingInterface[reconcile.Request]) { + q.Add(reconcile.Request{NamespacedName: k8sTypes.NamespacedName{ + Namespace: "", + Name: nicDeviceSyncEventName, + }}) + } + + eventHandler := handler.Funcs{ + // We skip create event because it's always followed by a status update + UpdateFunc: func(ctx context.Context, e event.UpdateEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) { + device := e.ObjectNew.(*v1alpha1.NicDevice) + + if device.Status.Node != r.NodeName { + // We want to skip event from devices not on the current node + return + } + + log.Log.Info("Enqueuing sync for update event", "resource", e.ObjectNew.GetName()) + qHandler(q) + }, + DeleteFunc: func(ctx context.Context, e event.DeleteEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) { + device := e.Object.(*v1alpha1.NicDevice) + + if device.Status.Node != r.NodeName { + return + } + + log.Log.Info("Enqueuing sync for delete event", "resource", e.Object.GetName()) + qHandler(q) + }, + GenericFunc: func(ctx context.Context, e event.GenericEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) { + device := e.Object.(*v1alpha1.NicDevice) + + if device.Status.Node != r.NodeName { + return + } + + log.Log.Info("Enqueuing sync for generic event", "resource", e.Object.GetName()) + qHandler(q) + }, + } + + controller := ctrl.NewControllerManagedBy(mgr). + For(&v1alpha1.NicDevice{}). + Watches(&v1alpha1.NicDevice{}, eventHandler) + + if watchForMaintenance { + maintenanceEventHandler := handler.Funcs{ + // We only want status update events + UpdateFunc: func(ctx context.Context, e event.UpdateEvent, q workqueue.TypedRateLimitingInterface[reconcile.Request]) { + nm := e.ObjectNew.(*maintenanceoperator.NodeMaintenance) + + if nm.Spec.RequestorID != consts.MaintenanceRequestor || nm.Spec.NodeName != r.NodeName { + // We want to skip event from maintenance not on the current node or not scheduled by us + return + } + + log.Log.Info("Enqueuing sync for maintenance update event", "resource", e.ObjectNew.GetName()) + qHandler(q) + }, + } + + controller.Watches(&maintenanceoperator.NodeMaintenance{}, maintenanceEventHandler) + } + + return controller. + Named("nicDeviceReconciler"). + Complete(r) +} + +// nvConfigReadyForAll returns true if nv config is ready for ALL devices, false if not ready for at least one device +func (p nicDeviceConfigurationStatuses) nvConfigReadyForAll() bool { + nvConfigReadyForAll := true + for _, result := range p { + if result.nvConfigUpdateRequired || result.rebootRequired { + nvConfigReadyForAll = false + log.Log.V(2).Info("nv config not ready for device", "device", result.device) + } + } + + log.Log.V(2).Info("nv config ready for all devices", "ready", nvConfigReadyForAll) + return nvConfigReadyForAll +} + +// rebootRequired returns true if reboot required for at least one device, false if not required for any device +func (p nicDeviceConfigurationStatuses) rebootRequired() bool { + rebootRequiredForSome := false + for _, result := range p { + if result.rebootRequired { + rebootRequiredForSome = true + log.Log.V(2).Info("reboot required for device", "device", result.device) + } + } + + return rebootRequiredForSome +} + +// nvConfigUpdateRequired returns true if nv config update required for at least one device, false if not required for any device +func (p nicDeviceConfigurationStatuses) nvConfigUpdateRequired() bool { + nvConfigUpdateRequiredForSome := false + for _, result := range p { + if result.nvConfigUpdateRequired { + nvConfigUpdateRequiredForSome = true + } + } + + log.Log.V(2).Info("nv config change required for some devices") + return nvConfigUpdateRequiredForSome +} diff --git a/internal/controller/nicdevice_controller_test.go b/internal/controller/nicdevice_controller_test.go new file mode 100644 index 0000000..ccd9a0e --- /dev/null +++ b/internal/controller/nicdevice_controller_test.go @@ -0,0 +1,642 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "errors" + "sync" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/stretchr/testify/mock" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sTypes "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/json" + "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/config" + "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" + "github.com/Mellanox/nic-configuration-operator/pkg/consts" + hostMocks "github.com/Mellanox/nic-configuration-operator/pkg/host/mocks" + maintenanceMocks "github.com/Mellanox/nic-configuration-operator/pkg/maintenance/mocks" + "github.com/Mellanox/nic-configuration-operator/pkg/testutils" + "github.com/Mellanox/nic-configuration-operator/pkg/types" +) + +const specValidationFailed = "spec validation failed" + +var _ = Describe("NicDeviceReconciler", func() { + var ( + mgr manager.Manager + reconciler *NicDeviceReconciler + hostManager *hostMocks.HostManager + maintenanceManager *maintenanceMocks.MaintenanceManager + nodeName = "test-node" + deviceName = "test-device" + ctx context.Context + cancel context.CancelFunc + timeout = time.Second * 10 + namespaceName string + wg sync.WaitGroup + err error + startManager = func() { + wg = sync.WaitGroup{} + wg.Add(1) + go func() { + defer wg.Done() + defer GinkgoRecover() + Expect(mgr.Start(ctx)).To(Succeed()) + }() + //time.Sleep(1 * time.Second) + } + ) + + BeforeEach(func() { + ctx, cancel = context.WithCancel(context.TODO()) + mgr, err = ctrl.NewManager(cfg, ctrl.Options{ + Scheme: k8sClient.Scheme(), + Metrics: metricsserver.Options{BindAddress: "0"}, + Controller: config.Controller{SkipNameValidation: ptr.To(true)}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(mgr).NotTo(BeNil()) + + node := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}} + Expect(k8sClient.Create(ctx, node)).To(Succeed()) + + namespaceName = "nic-configuration-operator-" + rand.String(6) + ns := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{ + Name: namespaceName, + }} + Expect(k8sClient.Create(context.Background(), ns)).To(Succeed()) + + err = mgr.GetCache().IndexField(context.Background(), &v1alpha1.NicDevice{}, "status.node", func(o client.Object) []string { + return []string{o.(*v1alpha1.NicDevice).Status.Node} + }) + Expect(err).NotTo(HaveOccurred()) + + deviceDiscoveryReconcileTime = 1 * time.Second + hostManager = &hostMocks.HostManager{} + maintenanceManager = &maintenanceMocks.MaintenanceManager{} + + reconciler = &NicDeviceReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + NodeName: nodeName, + NamespaceName: namespaceName, + HostManager: hostManager, + MaintenanceManager: maintenanceManager, + } + Expect(reconciler.SetupWithManager(mgr, false)).To(Succeed()) + }) + + AfterEach(func() { + Expect(k8sClient.DeleteAllOf(ctx, &v1alpha1.NicDevice{}, client.InNamespace(namespaceName))).To(Succeed()) + Expect(k8sClient.Delete(ctx, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespaceName}})).To(Succeed()) + Expect(k8sClient.DeleteAllOf(ctx, &v1.Node{})).To(Succeed()) + cancel() + wg.Wait() + }) + + Describe("nicDeviceConfigurationStatuses tests", func() { + It("should return correct flag if nvConfigUpdateRequired", func() { + statuses := nicDeviceConfigurationStatuses{{nvConfigUpdateRequired: true}, {nvConfigUpdateRequired: false}} + Expect(statuses.nvConfigUpdateRequired()).To(Equal(true)) + statuses = nicDeviceConfigurationStatuses{{nvConfigUpdateRequired: true}, {nvConfigUpdateRequired: true}} + Expect(statuses.nvConfigUpdateRequired()).To(Equal(true)) + statuses = nicDeviceConfigurationStatuses{{nvConfigUpdateRequired: false}, {nvConfigUpdateRequired: false}} + Expect(statuses.nvConfigUpdateRequired()).To(Equal(false)) + }) + + It("should return correct flag if rebootRequired", func() { + statuses := nicDeviceConfigurationStatuses{{rebootRequired: true}, {rebootRequired: false}} + Expect(statuses.rebootRequired()).To(Equal(true)) + statuses = nicDeviceConfigurationStatuses{{rebootRequired: true}, {rebootRequired: true}} + Expect(statuses.rebootRequired()).To(Equal(true)) + statuses = nicDeviceConfigurationStatuses{{rebootRequired: false}, {rebootRequired: false}} + Expect(statuses.rebootRequired()).To(Equal(false)) + }) + + It("should return correct flag if nvConfigReadyForAll", func() { + statuses := nicDeviceConfigurationStatuses{ + {rebootRequired: true, nvConfigUpdateRequired: true}, + {rebootRequired: false, nvConfigUpdateRequired: false}, + } + Expect(statuses.nvConfigReadyForAll()).To(Equal(false)) + statuses = nicDeviceConfigurationStatuses{ + {rebootRequired: true, nvConfigUpdateRequired: false}, + {rebootRequired: true, nvConfigUpdateRequired: false}, + } + Expect(statuses.nvConfigReadyForAll()).To(Equal(false)) + statuses = nicDeviceConfigurationStatuses{ + {rebootRequired: false, nvConfigUpdateRequired: true}, + {rebootRequired: false, nvConfigUpdateRequired: true}, + } + Expect(statuses.nvConfigReadyForAll()).To(Equal(false)) + statuses = nicDeviceConfigurationStatuses{ + {rebootRequired: true, nvConfigUpdateRequired: false}, + {rebootRequired: false, nvConfigUpdateRequired: true}, + } + Expect(statuses.nvConfigReadyForAll()).To(Equal(false)) + statuses = nicDeviceConfigurationStatuses{ + {rebootRequired: false, nvConfigUpdateRequired: false}, + {rebootRequired: false, nvConfigUpdateRequired: false}, + } + Expect(statuses.nvConfigReadyForAll()).To(Equal(true)) + }) + }) + + Describe("updateDeviceStatusCondition", func() { + It("should set the given status condition for device", func() { + + device := &v1alpha1.NicDevice{ObjectMeta: metav1.ObjectMeta{Name: deviceName, Namespace: namespaceName}} + Expect(k8sClient.Create(ctx, device)) + device.Status = v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{{ + PCI: "0000:3b:00.0", + }}, + } + Expect(k8sClient.Status().Update(ctx, device)).To(Succeed()) + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + + err := reconciler.updateDeviceStatusCondition(ctx, device, "TestReason", metav1.ConditionTrue, "test-message") + Expect(err).NotTo(HaveOccurred()) + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: "TestReason", + Message: "test-message", + })) + + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + err = reconciler.updateDeviceStatusCondition(ctx, device, "AnotherTestReason", metav1.ConditionFalse, "") + Expect(err).NotTo(HaveOccurred()) + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: "AnotherTestReason", + Message: "", + })) + }) + }) + + Describe("reconcile a single device", func() { + var createDevice = func(setLastSpecAnnotation bool) { + device := &v1alpha1.NicDevice{ + ObjectMeta: metav1.ObjectMeta{Name: deviceName, Namespace: namespaceName}, + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + ResetToDefault: false, + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 4, + LinkType: consts.Ethernet, + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxAccOutRead: 9999, + }, + RawNvConfig: []v1alpha1.NvConfigParam{{ + Name: "CUSTOM_PARAM", + Value: "true", + }}, + }, + }, + }, + } + if setLastSpecAnnotation { + device.SetAnnotations(map[string]string{consts.LastAppliedStateAnnotation: "some-state"}) + } + + Expect(k8sClient.Create(ctx, device)) + device.Status = v1alpha1.NicDeviceStatus{ + Node: nodeName, + Ports: []v1alpha1.NicDevicePortSpec{{ + PCI: "0000:3b:00.0", + }}, + } + Expect(k8sClient.Status().Update(ctx, device)).To(Succeed()) + } + + It("Should not reconcile device not from its node", func() { + device := &v1alpha1.NicDevice{ObjectMeta: metav1.ObjectMeta{Name: "another-name", Namespace: namespaceName}} + Expect(k8sClient.Create(ctx, device)) + device.Status = v1alpha1.NicDeviceStatus{ + Node: "some-other-node", + Ports: []v1alpha1.NicDevicePortSpec{{ + PCI: "0000:3b:00.0", + }}, + } + Expect(k8sClient.Status().Update(ctx, device)).To(Succeed()) + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: "another-name", Namespace: namespaceName}, device)).To(Succeed()) + + startManager() + + Consistently(func() []metav1.Condition { + maintenanceManager.AssertNotCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything) + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: "another-name", Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }).Should(BeNil()) + }) + It("Should result in SpecValidationFailed status if spec validation failed", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, false, errors.New(specValidationFailed)) + + createDevice(false) + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.SpecValidationFailed, + Message: specValidationFailed, + })) + }) + It("Should result in IncorrectSpec status if spec is incorrect", func() { + err := types.IncorrectSpecError("spec error") + errorText := err.Error() + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, false, err) + + createDevice(false) + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.IncorrectSpecReason, + Message: errorText, + })) + }) + It("Should result in UpdateSuccessful status if nv config updates or reboot are not required", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, false, nil) + hostManager.On("ApplyDeviceRuntimeSpec", mock.Anything).Return(nil) + maintenanceManager.On("ReleaseMaintenance", mock.Anything).Return(nil) + + createDevice(false) + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.UpdateSuccessfulReason, + })) + + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + spec, err := json.Marshal(device.Spec) + Expect(err).NotTo(HaveOccurred()) + + // Should dump the last applied state to annotations + Eventually(func() string { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + val := device.Annotations[consts.LastAppliedStateAnnotation] + return val + }).Should(Equal(string(spec))) + + maintenanceManager.AssertCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything) + maintenanceManager.AssertExpectations(GinkgoT()) + }) + It("Should keep in UpdateStarted status if maintenance fails to schedule", func() { + errorText := "maintenance request failed" + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, false, nil) + maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(errors.New(errorText)) + + createDevice(true) + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.UpdateStartedReason, + Message: "", // Should not copy the error message from the failed maintenance request + })) + // Should keep this status consistently + Consistently(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.UpdateStartedReason, + })) + // Should not reset last applied state annotation if maintenance was not scheduled + Consistently(func() bool { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + _, found := device.Annotations[consts.LastAppliedStateAnnotation] + return found + }).Should(BeTrue()) + }) + It("Should keep in UpdateStarted status if maintenance is not allowed", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, false, nil) + maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil) + maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(false, nil) + + createDevice(true) + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.UpdateStartedReason, + })) + // Should keep this status consistently + Consistently(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.UpdateStartedReason, + })) + // Should not reset last applied state annotation if maintenance was not allowed + Consistently(func() bool { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + _, found := device.Annotations[consts.LastAppliedStateAnnotation] + return found + }).Should(BeTrue()) + }) + It("Should result in NonVolatileConfigUpdateFailed status if nv config fails to apply", func() { + errorText := "maintenance request failed" + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, false, nil) + maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil) + maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil) + hostManager.On("ApplyDeviceNvSpec", mock.Anything, mock.Anything).Return(false, errors.New(errorText)) + + createDevice(false) + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.NonVolatileConfigUpdateFailedReason, + Message: errorText, + })) + }) + It("Should result in Pending status and not apply runtime spec if failed to reboot", func() { + errorText := "reboot request failed" + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, true, nil) + maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil) + maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil) + hostManager.On("ApplyDeviceNvSpec", mock.Anything, mock.Anything).Return(true, nil) + maintenanceManager.On("Reboot").Return(errors.New(errorText)) + + createDevice(true) + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.PendingRebootReason, + })) + + // Should reset last applied state annotation if tried to reboot but failed + Eventually(func() bool { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + _, found := device.Annotations[consts.LastAppliedStateAnnotation] + return found + }).Should(BeFalse()) + + maintenanceManager.AssertNotCalled(GinkgoT(), "ApplyDeviceRuntimeSpec", mock.Anything) + maintenanceManager.AssertExpectations(GinkgoT()) + }) + It("Should not release maintenance if runtime config failed to apply", func() { + errorText := "runtime config update failed" + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, false, nil) + hostManager.On("ApplyDeviceRuntimeSpec", mock.Anything).Return(errors.New(errorText)) + + createDevice(false) + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.RuntimeConfigUpdateFailedReason, + Message: errorText, + })) + + maintenanceManager.AssertNotCalled(GinkgoT(), "ScheduleMaintenance", mock.Anything) + maintenanceManager.AssertNotCalled(GinkgoT(), "MaintenanceAllowed", mock.Anything) + hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceNvSpec", mock.Anything, mock.Anything) + maintenanceManager.AssertNotCalled(GinkgoT(), "Reboot") + maintenanceManager.AssertNotCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything) + hostManager.AssertExpectations(GinkgoT()) + maintenanceManager.AssertExpectations(GinkgoT()) + }) + It("Should request maintenance if runtime config needs to be reset", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, false, nil) + maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil) + maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(false, nil) + + createDevice(true) // lastAppliedSpec will not match the current resulting in need to reboot + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.PendingRebootReason, + })) + + hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceNvSpec", mock.Anything, mock.Anything) + maintenanceManager.AssertNotCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything) + hostManager.AssertExpectations(GinkgoT()) + maintenanceManager.AssertExpectations(GinkgoT()) + }) + }) + Describe("reconcile multiple devices", func() { + var ( + secondDeviceName = "second-test-device" + + createDevices = func() { + device := &v1alpha1.NicDevice{ObjectMeta: metav1.ObjectMeta{Name: deviceName, Namespace: namespaceName}, Spec: v1alpha1.NicDeviceSpec{Configuration: &v1alpha1.NicDeviceConfigurationSpec{ResetToDefault: true}}} + Expect(k8sClient.Create(ctx, device)) + device.Status = v1alpha1.NicDeviceStatus{ + Node: nodeName, + Ports: []v1alpha1.NicDevicePortSpec{{ + PCI: "0000:3b:00.0", + }}, + } + Expect(k8sClient.Status().Update(ctx, device)).To(Succeed()) + + device = &v1alpha1.NicDevice{ObjectMeta: metav1.ObjectMeta{Name: secondDeviceName, Namespace: namespaceName}, Spec: v1alpha1.NicDeviceSpec{Configuration: &v1alpha1.NicDeviceConfigurationSpec{ResetToDefault: true}}} + Expect(k8sClient.Create(ctx, device)) + device.Status = v1alpha1.NicDeviceStatus{ + Node: nodeName, + Ports: []v1alpha1.NicDevicePortSpec{{ + PCI: "0000:d8:00.0", + }}, + } + Expect(k8sClient.Status().Update(ctx, device)).To(Succeed()) + } + + matchFirstDevice = mock.MatchedBy(func(input *v1alpha1.NicDevice) bool { + return input.Name == deviceName + }) + + matchSecondDevice = mock.MatchedBy(func(input *v1alpha1.NicDevice) bool { + return input.Name == secondDeviceName + }) + ) + + It("Should not begin maintenance and apply spec for device if spec validation failed for other device", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, matchSecondDevice).Return(true, true, nil) + hostManager.On("ValidateDeviceNvSpec", mock.Anything, matchFirstDevice).Return(false, false, errors.New(specValidationFailed)) + + createDevices() + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.SpecValidationFailed, + Message: specValidationFailed, + })) + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: secondDeviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.UpdateStartedReason, + })) + + maintenanceManager.AssertNotCalled(GinkgoT(), "ScheduleMaintenance", mock.Anything) + maintenanceManager.AssertExpectations(GinkgoT()) + }) + + It("Should not apply runtime spec for device if spec validation failed for other device", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, matchSecondDevice).Return(false, false, nil) + hostManager.On("ValidateDeviceNvSpec", mock.Anything, matchFirstDevice).Return(false, false, errors.New(specValidationFailed)) + + createDevices() + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.SpecValidationFailed, + Message: specValidationFailed, + })) + + Consistently(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: secondDeviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }).Should(BeEmpty()) // No operation are performed on this device, no reason to change status + + hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceRuntimeSpec", mock.Anything) + maintenanceManager.AssertExpectations(GinkgoT()) + }) + + It("Should not apply runtime spec for device if nv spec apply needed for other device", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, matchSecondDevice).Return(false, false, nil) + hostManager.On("ValidateDeviceNvSpec", mock.Anything, matchFirstDevice).Return(true, true, nil) + maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil) + maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil) + hostManager.On("ApplyDeviceNvSpec", mock.Anything, matchFirstDevice).Return(true, nil) + maintenanceManager.On("Reboot").Return(nil) + + createDevices() + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, time.Minute*2).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.PendingRebootReason, + })) + + Consistently(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: secondDeviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }).Should(BeEmpty()) // No operation are performed on this device, no reason to change status + + hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceRuntimeSpec", mock.Anything) + maintenanceManager.AssertExpectations(GinkgoT()) + }) + }) +}) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 7918f74..4d4a3a0 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -23,6 +23,7 @@ import ( "runtime" "testing" + "github.com/Mellanox/nic-configuration-operator/pkg/ncolog" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -56,6 +57,8 @@ func TestControllers(t *testing.T) { var _ = BeforeSuite(func() { logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + ncolog.SetLogLevel(2) + By("bootstrapping test environment") testEnv = &envtest.Environment{ CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, @@ -80,7 +83,7 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) //+kubebuilder:scaffold:scheme - k8sClient, err := client.New(cfg, client.Options{Scheme: scheme.Scheme}) + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) Expect(err).NotTo(HaveOccurred()) Expect(k8sClient).NotTo(BeNil()) diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 8bd77fc..1d35862 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -21,7 +21,14 @@ const ( Ethernet = "Ethernet" Infiniband = "Infiniband" - ConfigUpdateInProgressCondition = "ConfigUpdateInProgress" + ConfigUpdateInProgressCondition = "ConfigUpdateInProgress" + IncorrectSpecReason = "IncorrectSpec" + UpdateStartedReason = "UpdateStarted" + PendingRebootReason = "PendingReboot" + NonVolatileConfigUpdateFailedReason = "NonVolatileConfigUpdateFailed" + RuntimeConfigUpdateFailedReason = "RuntimeConfigUpdateFailed" + UpdateSuccessfulReason = "UpdateSuccessful" + SpecValidationFailed = "SpecValidationFailed" DeviceConfigSpecEmptyReason = "DeviceConfigSpecEmpty" @@ -29,6 +36,40 @@ const ( SerialNumberPrefix = "sn:" FirmwareVersionPrefix = "fw version:" PSIDPrefix = "psid:" + LinkStatsPrefix = "lnksta" + MaxReadReqPrefix = "maxreadreq" + TrustStatePrefix = "priority trust state:" + PfcEnabledPrefix = "enabled" NetClass = 0x02 + + LastAppliedStateAnnotation = "lastAppliedState" + + NvParamFalse = "0" + NvParamTrue = "1" + NvParamLinkTypeInfiniband = "1" + NvParamLinkTypeEthernet = "2" + + SriovEnabledParam = "SRIOV_EN" + SriovNumOfVfsParam = "NUM_OF_VFS" + LinkTypeP1Param = "LINK_TYPE_P1" + LinkTypeP2Param = "LINK_TYPE_P2" + MaxAccOutReadParam = "MAX_ACC_OUT_READ" + RoceCcPrioMaskP1Param = "ROCE_CC_PRIO_MASK_P1" + RoceCcPrioMaskP2Param = "ROCE_CC_PRIO_MASK_P2" + CnpDscpP1Param = "CNP_DSCP_P1" + CnpDscpP2Param = "CNP_DSCP_P2" + Cnp802pPrioP1Param = "CNP_802P_PRIO_P1" + Cnp802pPrioP2Param = "CNP_802P_PRIO_P2" + AtsEnabledParam = "ATS_ENABLED" + AdvancedPCISettingsParam = "ADVANCED_PCI_SETTINGS" + + SecondPortPrefix = "P2" + + EnvBaremetal = "Baremetal" + + MaintenanceRequestor = "configuration.nic.mellanox.com" + MaintenanceRequestName = "nic-configuration-operator-maintenance" + + HostPath = "/host" ) diff --git a/pkg/host/configvalidation.go b/pkg/host/configvalidation.go new file mode 100644 index 0000000..8ddcf67 --- /dev/null +++ b/pkg/host/configvalidation.go @@ -0,0 +1,312 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package host + +import ( + "fmt" + "reflect" + "strconv" + "strings" + + "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" + "github.com/Mellanox/nic-configuration-operator/pkg/consts" + "github.com/Mellanox/nic-configuration-operator/pkg/types" +) + +type configValidation interface { + // ConstructNvParamMapFromTemplate translates a configuration template into a set of nvconfig parameters + // operates under the assumption that spec validation was already carried out + ConstructNvParamMapFromTemplate( + device *v1alpha1.NicDevice, defaultValues map[string]string) (map[string]string, error) + // ValidateResetToDefault checks if device's nv config has been reset to default in current and next boots + // returns bool - need to perform reset + // returns bool - reboot required + // returns error - if an error occurred during validation + ValidateResetToDefault(nvConfig types.NvConfigQuery) (bool, bool, error) + // AdvancedPCISettingsEnabled returns true if ADVANCED_PCI_SETTINGS param is enabled for current config + AdvancedPCISettingsEnabled(currentConfig map[string]string) bool + // RuntimeConfigApplied checks if desired runtime config is applied + RuntimeConfigApplied(device *v1alpha1.NicDevice) (bool, error) + // CalculateDesiredRuntimeConfig returns desired values for runtime config + // returns int - maxReadRequestSize + // returns string - qos trust mode + // returns string - qos pfc settings + CalculateDesiredRuntimeConfig(device *v1alpha1.NicDevice) (int, string, string) +} + +type configValidationImpl struct { + utils HostUtils +} + +func nvParamLinkTypeFromName(linkType string) string { + if linkType == consts.Infiniband { + return consts.NvParamLinkTypeInfiniband + } else if linkType == consts.Ethernet { + return consts.NvParamLinkTypeEthernet + } + + return "" +} + +func applyDefaultNvConfigValueIfExists( + paramName string, desiredParameters map[string]string, defaultValues map[string]string) { + defaultValue, found := defaultValues[paramName] + // Default values might not yet be available if ENABLE_PCI_OPTIMIZATIONS is disabled + if found { + desiredParameters[paramName] = defaultValue + } +} + +// ConstructNvParamMapFromTemplate translates a configuration template into a set of nvconfig parameters +// operates under the assumption that spec validation was already carried out +func (v *configValidationImpl) ConstructNvParamMapFromTemplate( + device *v1alpha1.NicDevice, defaultValues map[string]string) (map[string]string, error) { + desiredParameters := map[string]string{} + + template := device.Spec.Configuration.Template + secondPortPresent := len(device.Status.Ports) > 1 + pciAddr := device.Status.Ports[0].PCI + + desiredParameters[consts.SriovEnabledParam] = consts.NvParamFalse + desiredParameters[consts.SriovNumOfVfsParam] = "0" + if template.NumVfs > 0 { + desiredParameters[consts.SriovEnabledParam] = consts.NvParamTrue + desiredParameters[consts.SriovNumOfVfsParam] = strconv.Itoa(template.NumVfs) + } + + if template.LinkType != "" { + // Link type change is not allowed on some devices + if _, found := defaultValues[consts.LinkTypeP1Param]; found { + linkType := nvParamLinkTypeFromName(string(template.LinkType)) + desiredParameters[consts.LinkTypeP1Param] = linkType + if secondPortPresent { + desiredParameters[consts.LinkTypeP2Param] = linkType + } + } + if device.Status.Ports[0].NetworkInterface != "" && + v.utils.GetLinkType(device.Status.Ports[0].NetworkInterface) != + string(device.Spec.Configuration.Template.LinkType) { + err := types.IncorrectSpecError( + fmt.Sprintf( + "device doesn't support link type change, wrong link type provided in the template, should be: %s", + v.utils.GetLinkType(pciAddr))) + log.Log.Error(err, "incorrect spec", "device", device.Name) + return desiredParameters, err + } + } + + if template.PciPerformanceOptimized != nil && template.PciPerformanceOptimized.Enabled { + if template.PciPerformanceOptimized.MaxAccOutRead != 0 { + desiredParameters[consts.MaxAccOutReadParam] = strconv.Itoa(template.PciPerformanceOptimized.MaxAccOutRead) + } else { + // If not specified, use the best parameters for specific PCI gen + pciLinkSpeed, err := v.utils.GetPCILinkSpeed(pciAddr) + if err != nil { + log.Log.Error(err, "failed to get PCI link speed", "pciAddr", pciAddr) + return desiredParameters, err + } + if pciLinkSpeed == 16 { // Gen4 + desiredParameters[consts.MaxAccOutReadParam] = "44" + } else if pciLinkSpeed > 16 { // Gen5 + desiredParameters[consts.MaxAccOutReadParam] = "0" + } + } + + // maxReadRequest is applied as runtime configuration + } else { + applyDefaultNvConfigValueIfExists(consts.MaxAccOutReadParam, desiredParameters, defaultValues) + } + + if template.RoceOptimized != nil && template.RoceOptimized.Enabled { + desiredParameters[consts.RoceCcPrioMaskP1Param] = "255" + desiredParameters[consts.CnpDscpP1Param] = "4" + desiredParameters[consts.Cnp802pPrioP1Param] = "6" + + if secondPortPresent { + desiredParameters[consts.RoceCcPrioMaskP2Param] = "255" + desiredParameters[consts.CnpDscpP2Param] = "4" + desiredParameters[consts.Cnp802pPrioP2Param] = "6" + } + + // qos settings are applied as runtime configuration + } else { + applyDefaultNvConfigValueIfExists(consts.RoceCcPrioMaskP1Param, desiredParameters, defaultValues) + applyDefaultNvConfigValueIfExists(consts.CnpDscpP1Param, desiredParameters, defaultValues) + applyDefaultNvConfigValueIfExists(consts.Cnp802pPrioP1Param, desiredParameters, defaultValues) + if secondPortPresent { + applyDefaultNvConfigValueIfExists(consts.RoceCcPrioMaskP2Param, desiredParameters, defaultValues) + applyDefaultNvConfigValueIfExists(consts.CnpDscpP2Param, desiredParameters, defaultValues) + applyDefaultNvConfigValueIfExists(consts.Cnp802pPrioP2Param, desiredParameters, defaultValues) + } + } + + if template.GpuDirectOptimized != nil && template.GpuDirectOptimized.Enabled { + if template.GpuDirectOptimized.Env != consts.EnvBaremetal { + err := types.IncorrectSpecError("GpuDirectOptimized supports only Baremetal env") + log.Log.Error(err, "incorrect spec", "device", device.Name) + return desiredParameters, err + } + + desiredParameters[consts.AtsEnabledParam] = consts.NvParamFalse + if template.PciPerformanceOptimized == nil || !template.PciPerformanceOptimized.Enabled { + err := types.IncorrectSpecError( + "GpuDirectOptimized should only be enabled together with PciPerformanceOptimized") + log.Log.Error(err, "incorrect spec", "device", device.Name) + return desiredParameters, err + } + } else { + applyDefaultNvConfigValueIfExists(consts.AtsEnabledParam, desiredParameters, defaultValues) + } + + for _, rawParam := range template.RawNvConfig { + // Ignore second port params if device has a single port + if strings.HasSuffix(rawParam.Name, consts.SecondPortPrefix) && !secondPortPresent { + continue + } + + desiredParameters[rawParam.Name] = rawParam.Value + } + + return desiredParameters, nil +} + +// ValidateResetToDefault checks if device's nv config has been reset to default in current and next boots +// returns bool - need to perform reset +// returns bool - reboot required +// returns error - if an error occurred during validation +func (v *configValidationImpl) ValidateResetToDefault(nvConfig types.NvConfigQuery) (bool, bool, error) { + // ResetToDefault requires us to set ADVANCED_PCI_SETTINGS=true, which is not a default value + // Deleting this key from maps so that it doesn't interfere with comparisons + delete(nvConfig.DefaultConfig, consts.AdvancedPCISettingsParam) + + alreadyResetInCurrent := false + willResetInNextBoot := false + + advancedPciSettingsEnabledInCurrentConfig := false + if value, found := nvConfig.CurrentConfig[consts.AdvancedPCISettingsParam]; found && value == consts.NvParamTrue { + advancedPciSettingsEnabledInCurrentConfig = true + } + if advancedPciSettingsEnabledInCurrentConfig { + delete(nvConfig.CurrentConfig, consts.AdvancedPCISettingsParam) + if reflect.DeepEqual(nvConfig.CurrentConfig, nvConfig.DefaultConfig) { + alreadyResetInCurrent = true + } + } + + advancedPciSettingsEnabledInNextBootConfig := false + if value, found := nvConfig.NextBootConfig[consts.AdvancedPCISettingsParam]; found && value == consts.NvParamTrue { + advancedPciSettingsEnabledInNextBootConfig = true + } + if advancedPciSettingsEnabledInNextBootConfig { + delete(nvConfig.NextBootConfig, consts.AdvancedPCISettingsParam) + if reflect.DeepEqual(nvConfig.NextBootConfig, nvConfig.DefaultConfig) { + willResetInNextBoot = true + } + } + + // Reset complete, nothing to do for now + if alreadyResetInCurrent && willResetInNextBoot { + return false, false, nil + } + // Reset will complete after reboot + if willResetInNextBoot { + return false, true, nil + } + // Reset required + return true, true, nil +} + +// AdvancedPCISettingsEnabled returns true if ADVANCED_PCI_SETTINGS param is enabled for current config +func (v *configValidationImpl) AdvancedPCISettingsEnabled(currentConfig map[string]string) bool { + if value, found := currentConfig[consts.AdvancedPCISettingsParam]; found && value == consts.NvParamTrue { + return true + } + return false +} + +// RuntimeConfigApplied checks if desired runtime config is applied +func (v *configValidationImpl) RuntimeConfigApplied(device *v1alpha1.NicDevice) (bool, error) { + ports := device.Status.Ports + + // TODO uncomment after a fix to mlnx_qos command + //desiredMaxReadReqSize, desiredTrust, desiredPfc := v.CalculateDesiredRuntimeConfig(device) + desiredMaxReadReqSize, _, _ := v.CalculateDesiredRuntimeConfig(device) + + if desiredMaxReadReqSize != 0 { + for _, port := range ports { + actualMaxReadReqSize, err := v.utils.GetMaxReadRequestSize(port.PCI) + if err != nil { + log.Log.Error(err, "can't validate maxReadReqSize", "device", device.Name) + return false, err + } + if actualMaxReadReqSize != desiredMaxReadReqSize { + return false, nil + } + } + } + + // TODO uncomment after a fix to mlnx_qos command + //for _, port := range ports { + // actualTrust, actualPfc, err := v.utils.GetTrustAndPFC(port.NetworkInterface) + // if err != nil { + // log.Log.Error(err, "can't validate QoS settings", "device", device.Name) + // return false, err + // } + // if actualTrust != desiredTrust || actualPfc != desiredPfc { + // return false, nil + // } + //} + + return true, nil +} + +// CalculateDesiredRuntimeConfig returns desired values for runtime config +// returns int - maxReadRequestSize +// returns string - qos trust mode +// returns string - qos pfc settings +func (v *configValidationImpl) CalculateDesiredRuntimeConfig(device *v1alpha1.NicDevice) (int, string, string) { + maxReadRequestSize := 0 + trust := "pcp" + pfc := "0,0,0,0,0,0,0,0" + + template := device.Spec.Configuration.Template + + if template.PciPerformanceOptimized != nil && template.PciPerformanceOptimized.Enabled { + if template.PciPerformanceOptimized.MaxReadRequest != 0 { + maxReadRequestSize = template.PciPerformanceOptimized.MaxReadRequest + } else { + maxReadRequestSize = 4096 + } + } + + if template.RoceOptimized != nil && template.RoceOptimized.Enabled { + trust = "dscp" + pfc = "0,0,0,1,0,0,0,0" + + if template.RoceOptimized.Qos != nil { + trust = template.RoceOptimized.Qos.Trust + pfc = template.RoceOptimized.Qos.PFC + } + } + + return maxReadRequestSize, trust, pfc +} + +func newConfigValidation(utils HostUtils) configValidation { + return &configValidationImpl{utils: utils} +} diff --git a/pkg/host/configvalidation_test.go b/pkg/host/configvalidation_test.go new file mode 100644 index 0000000..1f0ca20 --- /dev/null +++ b/pkg/host/configvalidation_test.go @@ -0,0 +1,745 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package host + +import ( + "fmt" + + "github.com/Mellanox/nic-configuration-operator/pkg/host/mocks" + "github.com/Mellanox/nic-configuration-operator/pkg/types" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/stretchr/testify/mock" + + "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" + "github.com/Mellanox/nic-configuration-operator/pkg/consts" +) + +const testVal = "testVal" +const anotherTestVal = "anotherTestVal" + +var _ = Describe("ConfigValidationImpl", func() { + var ( + validator *configValidationImpl + mockHostUtils mocks.HostUtils + ) + + BeforeEach(func() { + mockHostUtils = mocks.HostUtils{} + validator = &configValidationImpl{utils: &mockHostUtils} + }) + + Describe("ConstructNvParamMapFromTemplate", func() { + It("should return default values if optional config is disabled", func() { + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + {PCI: "0000:03:00.1"}, + }, + }, + } + defaultValues := map[string]string{ + consts.MaxAccOutReadParam: "testMaxAccOutRead", + consts.RoceCcPrioMaskP1Param: "testRoceCcP1", + consts.CnpDscpP1Param: "testDscpP1", + consts.Cnp802pPrioP1Param: "test802PrioP1", + consts.RoceCcPrioMaskP2Param: "testRoceCcP2", + consts.CnpDscpP2Param: "testDscpP2", + consts.Cnp802pPrioP2Param: "test802PrioP2", + consts.AtsEnabledParam: "testAts", + } + + nvParams, err := validator.ConstructNvParamMapFromTemplate(device, defaultValues) + Expect(err).NotTo(HaveOccurred()) + Expect(nvParams).To(HaveKeyWithValue(consts.SriovEnabledParam, consts.NvParamFalse)) + Expect(nvParams).To(HaveKeyWithValue(consts.SriovNumOfVfsParam, "0")) + Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "testMaxAccOutRead")) + Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "testRoceCcP1")) + Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "testDscpP1")) + Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "test802PrioP1")) + Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "testRoceCcP2")) + Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP2Param, "testDscpP2")) + Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "test802PrioP2")) + Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "testAts")) + }) + + It("should omit parameters for the second port if device is single port", func() { + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + }, + }, + } + defaultValues := map[string]string{ + consts.MaxAccOutReadParam: "testMaxAccOutRead", + consts.RoceCcPrioMaskP1Param: "testRoceCcP1", + consts.CnpDscpP1Param: "testDscpP1", + consts.Cnp802pPrioP1Param: "test802PrioP1", + consts.RoceCcPrioMaskP2Param: "testRoceCcP2", + consts.CnpDscpP2Param: "testDscpP2", + consts.Cnp802pPrioP2Param: "test802PrioP2", + consts.AtsEnabledParam: "testAts", + } + + nvParams, err := validator.ConstructNvParamMapFromTemplate(device, defaultValues) + Expect(err).NotTo(HaveOccurred()) + Expect(nvParams).To(HaveKeyWithValue(consts.SriovEnabledParam, consts.NvParamFalse)) + Expect(nvParams).To(HaveKeyWithValue(consts.SriovNumOfVfsParam, "0")) + Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "testMaxAccOutRead")) + Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "testAts")) + Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "testRoceCcP1")) + Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "testDscpP1")) + Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "test802PrioP1")) + Expect(nvParams).To(Not(HaveKey(consts.RoceCcPrioMaskP2Param))) + Expect(nvParams).To(Not(HaveKey(consts.CnpDscpP2Param))) + Expect(nvParams).To(Not(HaveKey(consts.Cnp802pPrioP2Param))) + }) + + It("should construct the correct nvparam map with optional optimizations enabled", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) + + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxAccOutRead: 1337, + MaxReadRequest: 1339, + }, + GpuDirectOptimized: &v1alpha1.GpuDirectOptimizedSpec{ + Enabled: true, + Env: consts.EnvBaremetal, + }, + RoceOptimized: &v1alpha1.RoceOptimizedSpec{ + Enabled: true, + Qos: &v1alpha1.QosSpec{ + Trust: "testTrust", + PFC: "testPFC", + }, + }, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + {PCI: "0000:03:00.1"}, + }, + }, + } + + defaultValues := map[string]string{} + + nvParams, err := validator.ConstructNvParamMapFromTemplate(device, defaultValues) + Expect(err).NotTo(HaveOccurred()) + Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "1337")) + Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "0")) + Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "255")) + Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "4")) + Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "6")) + Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "255")) + Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP2Param, "4")) + Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "6")) + }) + + It("should return the correct MaxAccOutRead param for PCIgen4", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) + + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + }, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + }, + }, + } + + defaultValues := map[string]string{} + + nvParams, err := validator.ConstructNvParamMapFromTemplate(device, defaultValues) + Expect(err).NotTo(HaveOccurred()) + Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "44")) + }) + + It("should return the correct MaxAccOutRead param for PCIgen5", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(64, nil) + + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + }, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + }, + }, + } + + defaultValues := map[string]string{} + + nvParams, err := validator.ConstructNvParamMapFromTemplate(device, defaultValues) + Expect(err).NotTo(HaveOccurred()) + Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "0")) + }) + It("should return an error when GpuOptimized is enabled without PciPerformanceOptimized", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) + + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + GpuDirectOptimized: &v1alpha1.GpuDirectOptimizedSpec{ + Enabled: true, + Env: consts.EnvBaremetal, + }, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + }, + }, + } + + defaultValues := map[string]string{} + + _, err := validator.ConstructNvParamMapFromTemplate(device, defaultValues) + Expect(err).To(HaveOccurred()) + }) + It("should ignore raw config for the second port if device is single port", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) + + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + RawNvConfig: []v1alpha1.NvConfigParam{ + { + Name: "TEST_P1", + Value: "test", + }, + { + Name: "TEST_P2", + Value: "test", + }, + }, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + }, + }, + } + + defaultValues := map[string]string{} + + nvParams, err := validator.ConstructNvParamMapFromTemplate(device, defaultValues) + Expect(err).NotTo(HaveOccurred()) + Expect(nvParams).To(HaveKeyWithValue("TEST_P1", "test")) + Expect(nvParams).NotTo(HaveKey("TEST_P2")) + }) + It("should apply raw config for the second port if device is dual port", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) + + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + RawNvConfig: []v1alpha1.NvConfigParam{ + { + Name: "TEST_P1", + Value: "test", + }, + { + Name: "TEST_P2", + Value: "test", + }, + }, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + {PCI: "0000:03:00.1"}, + }, + }, + } + + defaultValues := map[string]string{} + + nvParams, err := validator.ConstructNvParamMapFromTemplate(device, defaultValues) + Expect(err).NotTo(HaveOccurred()) + Expect(nvParams).To(HaveKeyWithValue("TEST_P1", "test")) + Expect(nvParams).To(HaveKeyWithValue("TEST_P2", "test")) + }) + }) + + Describe("ValidateResetToDefault", func() { + It("should return false, false if device is already reset in current and next boot", func() { + nvConfigQuery := types.NewNvConfigQuery() + nvConfigQuery.CurrentConfig[consts.AdvancedPCISettingsParam] = consts.NvParamTrue + nvConfigQuery.NextBootConfig[consts.AdvancedPCISettingsParam] = consts.NvParamTrue + + nvConfigQuery.DefaultConfig["RandomParam"] = testVal + nvConfigQuery.CurrentConfig["RandomParam"] = testVal + nvConfigQuery.NextBootConfig["RandomParam"] = testVal + + nvConfigChangeRequired, rebootRequired, err := validator.ValidateResetToDefault(nvConfigQuery) + Expect(nvConfigChangeRequired).To(Equal(false)) + Expect(rebootRequired).To(Equal(false)) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should return false, true if reset will complete after reboot", func() { + nvConfigQuery := types.NewNvConfigQuery() + nvConfigQuery.CurrentConfig[consts.AdvancedPCISettingsParam] = consts.NvParamTrue + nvConfigQuery.NextBootConfig[consts.AdvancedPCISettingsParam] = consts.NvParamTrue + + nvConfigQuery.DefaultConfig["RandomParam"] = testVal + nvConfigQuery.CurrentConfig["RandomParam"] = anotherTestVal + nvConfigQuery.NextBootConfig["RandomParam"] = testVal + + nvConfigChangeRequired, rebootRequired, err := validator.ValidateResetToDefault(nvConfigQuery) + Expect(nvConfigChangeRequired).To(Equal(false)) + Expect(rebootRequired).To(Equal(true)) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should return true, true if reset is required", func() { + nvConfigQuery := types.NewNvConfigQuery() + nvConfigQuery.CurrentConfig[consts.AdvancedPCISettingsParam] = consts.NvParamTrue + nvConfigQuery.NextBootConfig[consts.AdvancedPCISettingsParam] = consts.NvParamTrue + + nvConfigQuery.DefaultConfig["RandomParam"] = testVal + nvConfigQuery.CurrentConfig["RandomParam"] = anotherTestVal + nvConfigQuery.NextBootConfig["RandomParam"] = anotherTestVal + + nvConfigChangeRequired, rebootRequired, err := validator.ValidateResetToDefault(nvConfigQuery) + Expect(nvConfigChangeRequired).To(Equal(true)) + Expect(rebootRequired).To(Equal(true)) + Expect(err).NotTo(HaveOccurred()) + }) + }) + Describe("CalculateDesiredRuntimeConfig", func() { + It("should return correct defaults when no optimizations are enabled", func() { + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: nil, + RoceOptimized: nil, + }, + }, + }, + } + + maxReadRequestSize, trust, pfc := validator.CalculateDesiredRuntimeConfig(device) + Expect(maxReadRequestSize).To(Equal(0)) + Expect(trust).To(Equal("pcp")) + Expect(pfc).To(Equal("0,0,0,0,0,0,0,0")) + }) + + It("should calculate maxReadRequestSize when PciPerformanceOptimized is enabled with MaxReadRequest", func() { + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxReadRequest: 1024, + }, + RoceOptimized: nil, + }, + }, + }, + } + + maxReadRequestSize, trust, pfc := validator.CalculateDesiredRuntimeConfig(device) + Expect(maxReadRequestSize).To(Equal(1024)) + Expect(trust).To(Equal("pcp")) + Expect(pfc).To(Equal("0,0,0,0,0,0,0,0")) + }) + + It("should default maxReadReqSize to 4096 when PciPerformanceOptimized is enabled without MaxReadRequest", func() { + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxReadRequest: 0, + }, + RoceOptimized: nil, + }, + }, + }, + } + + maxReadRequestSize, trust, pfc := validator.CalculateDesiredRuntimeConfig(device) + Expect(maxReadRequestSize).To(Equal(4096)) + Expect(trust).To(Equal("pcp")) + Expect(pfc).To(Equal("0,0,0,0,0,0,0,0")) + }) + + It("should calculate trust and pfc when RoceOptimized is enabled with Qos", func() { + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: nil, + RoceOptimized: &v1alpha1.RoceOptimizedSpec{ + Enabled: true, + Qos: &v1alpha1.QosSpec{ + Trust: "dscp", + PFC: "0,1,0,1,0,0,0,0", + }, + }, + }, + }, + }, + } + + maxReadRequestSize, trust, pfc := validator.CalculateDesiredRuntimeConfig(device) + Expect(maxReadRequestSize).To(Equal(0)) + Expect(trust).To(Equal("dscp")) + Expect(pfc).To(Equal("0,1,0,1,0,0,0,0")) + }) + + It("should default trust and pfc when RoceOptimized is enabled without Qos", func() { + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: nil, + RoceOptimized: &v1alpha1.RoceOptimizedSpec{ + Enabled: true, + Qos: nil, + }, + }, + }, + }, + } + + maxReadRequestSize, trust, pfc := validator.CalculateDesiredRuntimeConfig(device) + Expect(maxReadRequestSize).To(Equal(0)) + Expect(trust).To(Equal("dscp")) + Expect(pfc).To(Equal("0,0,0,1,0,0,0,0")) + }) + + It("should prioritize RoceOptimized settings over defaults when both optimizations are enabled", func() { + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxReadRequest: 256, + }, + RoceOptimized: &v1alpha1.RoceOptimizedSpec{ + Enabled: true, + Qos: &v1alpha1.QosSpec{ + Trust: "customTrust", + PFC: "1,1,1,1,1,1,1,1", + }, + }, + }, + }, + }, + } + + maxReadRequestSize, trust, pfc := validator.CalculateDesiredRuntimeConfig(device) + Expect(maxReadRequestSize).To(Equal(256)) + Expect(trust).To(Equal("customTrust")) + Expect(pfc).To(Equal("1,1,1,1,1,1,1,1")) + }) + }) + + Describe("RuntimeConfigApplied", func() { + var ( + device *v1alpha1.NicDevice + applied bool + err error + ) + + BeforeEach(func() { + device = &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{}, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0", NetworkInterface: "interface0"}, + {PCI: "0000:03:00.1", NetworkInterface: "interface1"}, + }, + }, + } + }) + + Context("when desired runtime config is applied correctly on all ports", func() { + BeforeEach(func() { + desiredMaxReadReqSize, desiredTrust, desiredPfc := validator.CalculateDesiredRuntimeConfig(device) + + mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(desiredMaxReadReqSize, nil) + mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.1").Return(desiredMaxReadReqSize, nil) + + mockHostUtils.On("GetTrustAndPFC", "interface0").Return(desiredTrust, desiredPfc, nil) + mockHostUtils.On("GetTrustAndPFC", "interface1").Return(desiredTrust, desiredPfc, nil) + }) + + It("should return true with no error", func() { + applied, err = validator.RuntimeConfigApplied(device) + Expect(err).NotTo(HaveOccurred()) + Expect(applied).To(BeTrue()) + }) + }) + + Context("when desiredMaxReadRequestSize does not match on the first port", func() { + BeforeEach(func() { + device := device + device.Spec = v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxReadRequest: 2048, + }, + }, + }, + } + desiredMaxReadReqSize, desiredTrust, desiredPfc := validator.CalculateDesiredRuntimeConfig(device) + + mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(desiredMaxReadReqSize+128, nil) + + mockHostUtils.On("GetTrustAndPFC", "interface0").Return(desiredTrust, desiredPfc, nil) + mockHostUtils.On("GetTrustAndPFC", "interface1").Return(desiredTrust, desiredPfc, nil) + + // The second port should not be called since the first port already fails + }) + + It("should return false with no error", func() { + applied, err = validator.RuntimeConfigApplied(device) + Expect(err).NotTo(HaveOccurred()) + Expect(applied).To(BeFalse()) + }) + }) + + Context("when desiredMaxReadRequestSize does not match on the second port", func() { + BeforeEach(func() { + device := device + device.Spec = v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxReadRequest: 2048, + }, + }, + }, + } + + desiredMaxReadReqSize, desiredTrust, desiredPfc := validator.CalculateDesiredRuntimeConfig(device) + + mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(desiredMaxReadReqSize, nil) + mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.1").Return(desiredMaxReadReqSize+256, nil) + + mockHostUtils.On("GetTrustAndPFC", "interface0").Return(desiredTrust, desiredPfc, nil) + }) + + It("should return false with no error", func() { + applied, err = validator.RuntimeConfigApplied(device) + Expect(err).NotTo(HaveOccurred()) + Expect(applied).To(BeFalse()) + }) + }) + + // TODO uncomment after a fix to mlnx_qos command + //Context("when trust setting does not match on the first port", func() { + // BeforeEach(func() { + // desiredMaxReadReqSize, _, desiredPfc := validator.CalculateDesiredRuntimeConfig(device) + // + // mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(desiredMaxReadReqSize, nil) + // mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.1").Return(desiredMaxReadReqSize, nil) + // + // mockHostUtils.On("GetTrustAndPFC", "interface0").Return("differentTrust", desiredPfc, nil) + // // The second port should not be called since the first port already fails + // }) + // + // It("should return false with no error", func() { + // applied, err = validator.RuntimeConfigApplied(device) + // Expect(err).NotTo(HaveOccurred()) + // Expect(applied).To(BeFalse()) + // }) + //}) + // + // TODO uncomment after a fix to mlnx_qos command + //Context("when PFC setting does not match on the second port", func() { + // BeforeEach(func() { + // desiredMaxReadReqSize, desiredTrust, desiredPfc := validator.CalculateDesiredRuntimeConfig(device) + // + // mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(desiredMaxReadReqSize, nil) + // mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.1").Return(desiredMaxReadReqSize, nil) + // + // mockHostUtils.On("GetTrustAndPFC", "interface0").Return(desiredTrust, desiredPfc, nil) + // + // mockHostUtils.On("GetTrustAndPFC", "interface1").Return(desiredTrust, "differentPfc", nil) + // }) + // + // It("should return false with no error", func() { + // applied, err = validator.RuntimeConfigApplied(device) + // Expect(err).NotTo(HaveOccurred()) + // Expect(applied).To(BeFalse()) + // }) + //}) + + Context("when GetMaxReadRequestSize returns an error", func() { + BeforeEach(func() { + device := device + device.Spec = v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxReadRequest: 2048, + }, + }, + }, + } + + _, _, _ = validator.CalculateDesiredRuntimeConfig(device) + + mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(0, fmt.Errorf("command failed")) + }) + + It("should return false with the error", func() { + applied, err = validator.RuntimeConfigApplied(device) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("command failed")) + Expect(applied).To(BeFalse()) + }) + }) + + // TODO uncomment after a fix to mlnx_qos command + //Context("when GetTrustAndPFC returns an error on the first port", func() { + // BeforeEach(func() { + // desiredMaxReadReqSize, _, _ := validator.CalculateDesiredRuntimeConfig(device) + // + // mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(desiredMaxReadReqSize, nil) + // mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.1").Return(desiredMaxReadReqSize, nil) + // + // mockHostUtils.On("GetTrustAndPFC", "interface0").Return("", "", fmt.Errorf("failed to get trust and pfc")) + // }) + // + // It("should return false with the error", func() { + // applied, err = validator.RuntimeConfigApplied(device) + // Expect(err).To(HaveOccurred()) + // Expect(err.Error()).To(ContainSubstring("failed to get trust and pfc")) + // Expect(applied).To(BeFalse()) + // }) + //}) + + Context("when device has a single port and all settings are applied correctly", func() { + BeforeEach(func() { + device := device + device.Status.Ports = []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0", NetworkInterface: "interface0"}, + } + + desiredMaxReadReqSize, desiredTrust, desiredPfc := validator.CalculateDesiredRuntimeConfig(device) + + mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(desiredMaxReadReqSize, nil) + mockHostUtils.On("GetTrustAndPFC", "interface0").Return(desiredTrust, desiredPfc, nil) + }) + + It("should return true with no error", func() { + applied, err = validator.RuntimeConfigApplied(device) + Expect(err).NotTo(HaveOccurred()) + Expect(applied).To(BeTrue()) + }) + }) + + // TODO uncomment after a fix to mlnx_qos command + //Context("when device has a single port and trust setting does not match", func() { + // BeforeEach(func() { + // device := device + // device.Status.Ports = []v1alpha1.NicDevicePortSpec{ + // {PCI: "0000:03:00.0", NetworkInterface: "interface0"}, + // } + // + // desiredMaxReadReqSize, _, desiredPfc := validator.CalculateDesiredRuntimeConfig(device) + // + // mockHostUtils.On("GetMaxReadRequestSize", "0000:03:00.0").Return(desiredMaxReadReqSize, nil) + // mockHostUtils.On("GetTrustAndPFC", "interface0").Return("differentTrust", desiredPfc, nil) + // }) + // + // It("should return false with no error", func() { + // applied, err = validator.RuntimeConfigApplied(device) + // Expect(err).NotTo(HaveOccurred()) + // Expect(applied).To(BeFalse()) + // }) + //}) + }) +}) diff --git a/pkg/host/host.go b/pkg/host/host.go index 21a562f..648ce1e 100644 --- a/pkg/host/host.go +++ b/pkg/host/host.go @@ -16,8 +16,11 @@ limitations under the License. package host import ( + "context" + "fmt" "strconv" + "github.com/Mellanox/nic-configuration-operator/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" @@ -28,10 +31,24 @@ import ( type HostManager interface { // DiscoverNicDevices discovers Nvidia NIC devices on the host and returns back a map of serial numbers to device statuses DiscoverNicDevices() (map[string]v1alpha1.NicDeviceStatus, error) + // ValidateDeviceNvSpec will validate device's non-volatile spec against already applied configuration on the host + // returns bool - nv config update required + // returns bool - reboot required + // returns error - there are errors in device's spec + ValidateDeviceNvSpec(ctx context.Context, device *v1alpha1.NicDevice) (bool, bool, error) + // ApplyDeviceNvSpec calculates device's missing nv spec configuration and applies it to the device on the host + // returns bool - reboot required + // returns error - there were errors while applying nv configuration + ApplyDeviceNvSpec(ctx context.Context, device *v1alpha1.NicDevice) (bool, error) + // ApplyDeviceRuntimeSpec calculates device's missing runtime spec configuration and applies it to the device on the host + // returns error - there were errors while applying nv configuration + ApplyDeviceRuntimeSpec(device *v1alpha1.NicDevice) error } type hostManager struct { - hostUtils HostUtils + nodeName string + hostUtils HostUtils + configValidation configValidation } // DiscoverNicDevices uses host utils to discover Nvidia NIC devices on the host and returns back a map of serial numbers to device statuses @@ -107,12 +124,199 @@ func (h hostManager) DiscoverNicDevices() (map[string]v1alpha1.NicDeviceStatus, RdmaInterface: rdmaInterface, }) + deviceStatus.Node = h.nodeName devices[deviceStatus.SerialNumber] = deviceStatus } return devices, nil } -func NewHostManager(hostUtils HostUtils) HostManager { - return hostManager{hostUtils: hostUtils} +// ValidateDeviceNvSpec will validate device's non-volatile spec against already applied configuration on the host +// returns bool - nv config update required +// returns bool - reboot required +// returns error - there are errors in device's spec +// if fully matches in current and next config, returns false, false +// if fully matched next but not current, returns false, true +// if not fully matched next boot, returns true, true +func (h hostManager) ValidateDeviceNvSpec(ctx context.Context, device *v1alpha1.NicDevice) (bool, bool, error) { + log.Log.Info("hostManager.ValidateDeviceNvSpec", "device", device.Name) + + nvConfig, err := h.hostUtils.QueryNvConfig(ctx, device.Status.Ports[0].PCI) + if err != nil { + log.Log.Error(err, "failed to query nv config", "device", device.Name) + return false, false, err + } + + if device.Spec.Configuration.ResetToDefault { + return h.configValidation.ValidateResetToDefault(nvConfig) + } + + desiredConfig, err := h.configValidation.ConstructNvParamMapFromTemplate(device, nvConfig.DefaultConfig) + if err != nil { + log.Log.Error(err, "failed to calculate desired nvconfig parameters", "device", device.Name) + return false, false, err + } + + configUpdateNeeded := false + rebootNeeded := false + + // If ADVANCED_PCI_SETTINGS are enabled in current config, unknown parameters are treated as spec error + advancedPciSettingsEnabled := h.configValidation.AdvancedPCISettingsEnabled(nvConfig.CurrentConfig) + + for parameter, desiredValue := range desiredConfig { + currentValue, foundInCurrent := nvConfig.CurrentConfig[parameter] + nextValue, foundInNextBoot := nvConfig.NextBootConfig[parameter] + if advancedPciSettingsEnabled && !foundInCurrent { + err = types.IncorrectSpecError(fmt.Sprintf("Parameter %s unsupported for device %s", parameter, device.Name)) + log.Log.Error(err, "can't set nv config parameter for device") + return false, false, err + } + + if foundInNextBoot && nextValue == desiredValue { + if !foundInCurrent || currentValue != desiredValue { + rebootNeeded = true + } + } else { + configUpdateNeeded = true + rebootNeeded = true + } + } + + return configUpdateNeeded, rebootNeeded, nil +} + +// ApplyDeviceNvSpec calculates device's missing nv spec configuration and applies it to the device on the host +// returns bool - reboot required +// returns error - there were errors while applying nv configuration +func (h hostManager) ApplyDeviceNvSpec(ctx context.Context, device *v1alpha1.NicDevice) (bool, error) { + log.Log.Info("hostManager.ApplyDeviceNvSpec", "device", device.Name) + + pciAddr := device.Status.Ports[0].PCI + + if device.Spec.Configuration.ResetToDefault { + log.Log.Info("resetting nv config to default", "device", device.Name) // todo + err := h.hostUtils.ResetNvConfig(pciAddr) + if err != nil { + log.Log.Error(err, "Failed to reset nv config", "device", device.Name) + return false, err + } + + err = h.hostUtils.SetNvConfigParameter(pciAddr, consts.AdvancedPCISettingsParam, consts.NvParamTrue) + if err != nil { + log.Log.Error(err, "Failed to apply nv config parameter", "device", device.Name, "param", consts.AdvancedPCISettingsParam, "value", consts.NvParamTrue) + return false, err + } + + return true, err + } + + nvConfig, err := h.hostUtils.QueryNvConfig(ctx, device.Status.Ports[0].PCI) + if err != nil { + log.Log.Error(err, "failed to query nv config", "device", device.Name) + return false, err + } + + if !h.configValidation.AdvancedPCISettingsEnabled(nvConfig.CurrentConfig) { + log.Log.V(2).Info("AdvancedPciSettings not enabled, fw reset required", "device", device.Name) + err = h.hostUtils.SetNvConfigParameter(pciAddr, consts.AdvancedPCISettingsParam, consts.NvParamTrue) + if err != nil { + log.Log.Error(err, "Failed to apply nv config parameter", "device", device.Name, "param", consts.AdvancedPCISettingsParam, "value", consts.NvParamTrue) + return false, err + } + + err = h.hostUtils.ResetNicFirmware(ctx, pciAddr) + if err != nil { + log.Log.Error(err, "Failed to reset NIC firmware", "device", device.Name) + return false, err + } + + // Query nv config again, additional options could become available + nvConfig, err = h.hostUtils.QueryNvConfig(ctx, device.Status.Ports[0].PCI) + if err != nil { + log.Log.Error(err, "failed to query nv config", "device", device.Name) + return false, err + } + } + + desiredConfig, err := h.configValidation.ConstructNvParamMapFromTemplate(device, nvConfig.DefaultConfig) + if err != nil { + log.Log.Error(err, "failed to calculate desired nvconfig parameters", "device", device.Name) + return false, err + } + + paramsToApply := map[string]string{} + + for param, value := range desiredConfig { + nextVal, found := nvConfig.NextBootConfig[param] + if !found { + err = types.IncorrectSpecError(fmt.Sprintf("Parameter %s unsupported for device %s", param, device.Name)) + log.Log.Error(err, "can't set nv config parameter for device") + return false, err + } + + if nextVal != value { + paramsToApply[param] = value + } + } + + log.Log.V(2).Info("applying nv config to device", "device", device.Name, "config", paramsToApply) + + for param, value := range paramsToApply { + err = h.hostUtils.SetNvConfigParameter(pciAddr, param, value) + if err != nil { + log.Log.Error(err, "Failed to apply nv config parameter", "device", device.Name, "param", param, "value", value) + return false, err + } + } + + log.Log.V(2).Info("nv config successfully applied to device", "device", device.Name) + + return true, nil +} + +// ApplyDeviceRuntimeSpec calculates device's missing runtime spec configuration and applies it to the device on the host +// returns error - there were errors while applying nv configuration +func (h hostManager) ApplyDeviceRuntimeSpec(device *v1alpha1.NicDevice) error { + log.Log.Info("hostManager.ApplyDeviceRuntimeSpec", "device", device.Name) + + alreadyApplied, err := h.configValidation.RuntimeConfigApplied(device) + if err != nil { + log.Log.Error(err, "failed to verify runtime configuration", "device", device) + } + + if alreadyApplied { + log.Log.V(2).Info("runtime config already applied", "device", device) + return nil + } + + // TODO uncomment after a fix to mlnx_qos command + //desiredMaxReadReqSize, desiredTrust, desiredPfc := h.configValidation.CalculateDesiredRuntimeConfig(device) + desiredMaxReadReqSize, _, _ := h.configValidation.CalculateDesiredRuntimeConfig(device) + + ports := device.Status.Ports + + if desiredMaxReadReqSize != 0 { + for _, port := range ports { + err = h.hostUtils.SetMaxReadRequestSize(port.PCI, desiredMaxReadReqSize) + if err != nil { + log.Log.Error(err, "failed to apply runtime configuration", "device", device) + return err + } + } + } + + // TODO uncomment after a fix to mlnx_qos command + //for _, port := range ports { + // err = h.hostUtils.SetTrustAndPFC(port.NetworkInterface, desiredTrust, desiredPfc) + // if err != nil { + // log.Log.Error(err, "failed to apply runtime configuration", "device", device) + // return err + // } + //} + + return nil +} + +func NewHostManager(nodeName string, hostUtils HostUtils) HostManager { + return hostManager{nodeName: nodeName, hostUtils: hostUtils, configValidation: newConfigValidation(hostUtils)} } diff --git a/pkg/host/host_test.go b/pkg/host/host_test.go index f38d69e..cf33d29 100644 --- a/pkg/host/host_test.go +++ b/pkg/host/host_test.go @@ -16,11 +16,14 @@ limitations under the License. package host import ( + "context" "errors" + "fmt" "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" "github.com/Mellanox/nic-configuration-operator/pkg/consts" "github.com/Mellanox/nic-configuration-operator/pkg/host/mocks" + "github.com/Mellanox/nic-configuration-operator/pkg/types" "github.com/jaypipes/ghw/pkg/pci" "github.com/jaypipes/pcidb" . "github.com/onsi/ginkgo/v2" @@ -30,20 +33,21 @@ import ( var _ = Describe("HostManager", func() { var ( mockHostUtils mocks.HostUtils - hostManager HostManager + manager hostManager ) BeforeEach(func() { mockHostUtils = mocks.HostUtils{} - hostManager = NewHostManager(&mockHostUtils) + manager = hostManager{hostUtils: &mockHostUtils} }) Describe("DiscoverNicDevices", func() { Context("when GetPCIDevices fails", func() { It("should return nil and log an error", func() { - mockHostUtils.On("GetPCIDevices").Return(nil, errors.New("get PCI devices error")) + mockHostUtils.On("GetPCIDevices"). + Return(nil, errors.New("get PCI devices error")) - devices, err := hostManager.DiscoverNicDevices() + devices, err := manager.DiscoverNicDevices() Expect(err).To(HaveOccurred()) Expect(devices).To(BeNil()) mockHostUtils.AssertExpectations(GinkgoT()) @@ -61,7 +65,7 @@ var _ = Describe("HostManager", func() { }, }, nil) - devices, err := hostManager.DiscoverNicDevices() + devices, err := manager.DiscoverNicDevices() Expect(err).NotTo(HaveOccurred()) Expect(devices).To(BeEmpty()) mockHostUtils.AssertExpectations(GinkgoT()) @@ -79,7 +83,7 @@ var _ = Describe("HostManager", func() { }, }, nil) - devices, err := hostManager.DiscoverNicDevices() + devices, err := manager.DiscoverNicDevices() Expect(err).NotTo(HaveOccurred()) Expect(devices).To(BeEmpty()) mockHostUtils.AssertExpectations(GinkgoT()) @@ -101,7 +105,7 @@ var _ = Describe("HostManager", func() { It("should log and skip devices if IsSriovVF returns true", func() { mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(true) - devices, err := hostManager.DiscoverNicDevices() + devices, err := manager.DiscoverNicDevices() Expect(err).NotTo(HaveOccurred()) Expect(devices).To(BeEmpty()) mockHostUtils.AssertExpectations(GinkgoT()) @@ -109,9 +113,10 @@ var _ = Describe("HostManager", func() { It("should fails if GetPartAndSerialNumber fails", func() { mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0").Return("", "", errors.New("serial number error")) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0"). + Return("", "", errors.New("serial number error")) - devices, err := hostManager.DiscoverNicDevices() + devices, err := manager.DiscoverNicDevices() Expect(err).To(HaveOccurred()) Expect(devices).To(BeNil()) mockHostUtils.AssertExpectations(GinkgoT()) @@ -119,10 +124,12 @@ var _ = Describe("HostManager", func() { It("should log and skip devices if GetFirmwareVersionAndPSID fails", func() { mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0").Return("part-number", "serial-number", nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0").Return("", "", errors.New("firmware error")) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0"). + Return("part-number", "serial-number", nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0"). + Return("", "", errors.New("firmware error")) - devices, err := hostManager.DiscoverNicDevices() + devices, err := manager.DiscoverNicDevices() Expect(err).To(HaveOccurred()) Expect(devices).To(BeNil()) mockHostUtils.AssertExpectations(GinkgoT()) @@ -130,12 +137,16 @@ var _ = Describe("HostManager", func() { It("should discover and return devices successfully", func() { mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0").Return("part-number", "serial-number", nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0").Return("fw-version", "psid", nil) - mockHostUtils.On("GetInterfaceName", "0000:00:00.0").Return("eth0") - mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0").Return("mlx5_0") - - devices, err := hostManager.DiscoverNicDevices() + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0"). + Return("part-number", "serial-number", nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0"). + Return("fw-version", "psid", nil) + mockHostUtils.On("GetInterfaceName", "0000:00:00.0"). + Return("eth0") + mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0"). + Return("mlx5_0") + + devices, err := manager.DiscoverNicDevices() Expect(err).NotTo(HaveOccurred()) expectedDeviceStatus := v1alpha1.NicDeviceStatus{ @@ -180,15 +191,20 @@ var _ = Describe("HostManager", func() { }) It("should log and skip only a faulty device if IsSriovVF returns true", func() { - mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0").Return("part-number", "serial-number", nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0").Return("fw-version", "psid", nil) - mockHostUtils.On("GetInterfaceName", "0000:00:00.0").Return("eth0") - mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0").Return("mlx5_0") + mockHostUtils.On("IsSriovVF", "0000:00:00.0"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0"). + Return("part-number", "serial-number", nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0"). + Return("fw-version", "psid", nil) + mockHostUtils.On("GetInterfaceName", "0000:00:00.0"). + Return("eth0") + mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0"). + Return("mlx5_0") mockHostUtils.On("IsSriovVF", "0000:00:00.1").Return(true) - devices, err := hostManager.DiscoverNicDevices() + devices, err := manager.DiscoverNicDevices() Expect(err).NotTo(HaveOccurred()) expectedDeviceStatus := v1alpha1.NicDeviceStatus{ Type: "test-id", @@ -211,52 +227,77 @@ var _ = Describe("HostManager", func() { }) It("should log and skip only a faulty device if GetPartAndSerialNumber fails", func() { - mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0").Return("part-number", "serial-number", nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0").Return("fw-version", "psid", nil) - mockHostUtils.On("GetInterfaceName", "0000:00:00.0").Return("eth0") - mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0").Return("mlx5_0") - - mockHostUtils.On("IsSriovVF", "0000:00:00.1").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.1").Return("", "", errors.New("serial number error")) - - devices, err := hostManager.DiscoverNicDevices() + mockHostUtils.On("IsSriovVF", "0000:00:00.0"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0"). + Return("part-number", "serial-number", nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0"). + Return("fw-version", "psid", nil) + mockHostUtils.On("GetInterfaceName", "0000:00:00.0"). + Return("eth0") + mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0"). + Return("mlx5_0") + + mockHostUtils.On("IsSriovVF", "0000:00:00.1"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.1"). + Return("", "", errors.New("serial number error")) + + devices, err := manager.DiscoverNicDevices() Expect(err).To(HaveOccurred()) Expect(devices).To(BeNil()) mockHostUtils.AssertExpectations(GinkgoT()) }) It("should log and skip only a faulty device if GetFirmwareVersionAndPSID fails", func() { - mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0").Return("part-number", "serial-number", nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0").Return("fw-version", "psid", nil) - mockHostUtils.On("GetInterfaceName", "0000:00:00.0").Return("eth0") - mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0").Return("mlx5_0") - - mockHostUtils.On("IsSriovVF", "0000:00:00.1").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.1").Return("part-number", "serial-number-2", nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.1").Return("", "", errors.New("firmware error")) - - devices, err := hostManager.DiscoverNicDevices() + mockHostUtils.On("IsSriovVF", "0000:00:00.0"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0"). + Return("part-number", "serial-number", nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0"). + Return("fw-version", "psid", nil) + mockHostUtils.On("GetInterfaceName", "0000:00:00.0"). + Return("eth0") + mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0"). + Return("mlx5_0") + + mockHostUtils.On("IsSriovVF", "0000:00:00.1"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.1"). + Return("part-number", "serial-number-2", nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.1"). + Return("", "", errors.New("firmware error")) + + devices, err := manager.DiscoverNicDevices() Expect(err).To(HaveOccurred()) Expect(devices).To(BeNil()) mockHostUtils.AssertExpectations(GinkgoT()) }) It("should discover and return devices successfully", func() { - mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0").Return("part-number", "serial-number", nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0").Return("fw-version", "psid", nil) - mockHostUtils.On("GetInterfaceName", "0000:00:00.0").Return("eth0") - mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0").Return("mlx5_0") - - mockHostUtils.On("IsSriovVF", "0000:00:00.1").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.1").Return("part-number", "serial-number-2", nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.1").Return("fw-version", "psid", nil) - mockHostUtils.On("GetInterfaceName", "0000:00:00.1").Return("eth1") - mockHostUtils.On("GetRDMADeviceName", "0000:00:00.1").Return("mlx5_1") - - devices, err := hostManager.DiscoverNicDevices() + mockHostUtils.On("IsSriovVF", "0000:00:00.0"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0"). + Return("part-number", "serial-number", nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0"). + Return("fw-version", "psid", nil) + mockHostUtils.On("GetInterfaceName", "0000:00:00.0"). + Return("eth0") + mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0"). + Return("mlx5_0") + + mockHostUtils.On("IsSriovVF", "0000:00:00.1"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.1"). + Return("part-number", "serial-number-2", nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.1"). + Return("fw-version", "psid", nil) + mockHostUtils.On("GetInterfaceName", "0000:00:00.1"). + Return("eth1") + mockHostUtils.On("GetRDMADeviceName", "0000:00:00.1"). + Return("mlx5_1") + + devices, err := manager.DiscoverNicDevices() Expect(err).NotTo(HaveOccurred()) expectedDeviceStatus1 := v1alpha1.NicDeviceStatus{ @@ -316,19 +357,28 @@ var _ = Describe("HostManager", func() { }, }, nil) - mockHostUtils.On("IsSriovVF", "0000:00:00.0").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0").Return("part-number", sameSerialNumber, nil) - mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0").Return("fw-version", "psid", nil) - mockHostUtils.On("GetInterfaceName", "0000:00:00.0").Return("eth0") - mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0").Return("mlx5_0") - - mockHostUtils.On("IsSriovVF", "0000:00:00.1").Return(false) - mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.1").Return("part-number", sameSerialNumber, nil) + mockHostUtils.On("IsSriovVF", "0000:00:00.0"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.0"). + Return("part-number", sameSerialNumber, nil) + mockHostUtils.On("GetFirmwareVersionAndPSID", "0000:00:00.0"). + Return("fw-version", "psid", nil) + mockHostUtils.On("GetInterfaceName", "0000:00:00.0"). + Return("eth0") + mockHostUtils.On("GetRDMADeviceName", "0000:00:00.0"). + Return("mlx5_0") + + mockHostUtils.On("IsSriovVF", "0000:00:00.1"). + Return(false) + mockHostUtils.On("GetPartAndSerialNumber", "0000:00:00.1"). + Return("part-number", sameSerialNumber, nil) mockHostUtils.AssertNotCalled(GinkgoT(), "GetFirmwareVersionAndPSID", "0000:00:00.1") - mockHostUtils.On("GetInterfaceName", "0000:00:00.1").Return("eth1") - mockHostUtils.On("GetRDMADeviceName", "0000:00:00.1").Return("mlx5_1") + mockHostUtils.On("GetInterfaceName", "0000:00:00.1"). + Return("eth1") + mockHostUtils.On("GetRDMADeviceName", "0000:00:00.1"). + Return("mlx5_1") - devices, err := hostManager.DiscoverNicDevices() + devices, err := manager.DiscoverNicDevices() Expect(err).NotTo(HaveOccurred()) expectedDeviceStatus := v1alpha1.NicDeviceStatus{ Type: "test-id", @@ -356,4 +406,615 @@ var _ = Describe("HostManager", func() { mockHostUtils.AssertExpectations(GinkgoT()) }) }) + Describe("hostManager.ValidateDeviceNvSpec", func() { + var ( + mockHostUtils mocks.HostUtils + mockConfigValidation mocks.ConfigValidation + manager hostManager + ctx context.Context + device *v1alpha1.NicDevice + pciAddress string + ) + + BeforeEach(func() { + mockHostUtils = mocks.HostUtils{} + mockConfigValidation = mocks.ConfigValidation{} + manager = hostManager{ + hostUtils: &mockHostUtils, + configValidation: &mockConfigValidation, + } + ctx = context.TODO() + pciAddress = "0000:3b:00.0" + + device = &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + ResetToDefault: false, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: pciAddress}, + }, + }, + } + }) + + Describe("ValidateDeviceNvSpec", func() { + Context("when QueryNvConfig returns an error", func() { + It("should return false, false, and the error", func() { + queryErr := errors.New("failed to query nv config") + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(types.NewNvConfigQuery(), queryErr) + + configUpdate, reboot, err := manager.ValidateDeviceNvSpec(ctx, device) + Expect(configUpdate).To(BeFalse()) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(queryErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + }) + }) + + Context("when ResetToDefault is true", func() { + BeforeEach(func() { + device.Spec.Configuration.ResetToDefault = true + }) + + It("should call ValidateResetToDefault and return its results", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("ValidateResetToDefault", nvConfig). + Return(true, false, nil) + + configUpdate, reboot, err := manager.ValidateDeviceNvSpec(ctx, device) + Expect(configUpdate).To(BeTrue()) + Expect(reboot).To(BeFalse()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + + It("should return an error if ValidateResetToDefault fails", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{}, + NextBootConfig: map[string]string{}, + DefaultConfig: map[string]string{}, + } + validationErr := errors.New("validation failed") + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("ValidateResetToDefault", nvConfig). + Return(false, false, validationErr) + + configUpdate, reboot, err := manager.ValidateDeviceNvSpec(ctx, device) + Expect(configUpdate).To(BeFalse()) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(validationErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + + Context("when ConstructNvParamMapFromTemplate returns an error", func() { + It("should return false, false, and the error", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{}, + NextBootConfig: map[string]string{}, + DefaultConfig: map[string]string{}, + } + constructErr := errors.New("failed to construct desired config") + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(nil, constructErr) + + configUpdate, reboot, err := manager.ValidateDeviceNvSpec(ctx, device) + Expect(configUpdate).To(BeFalse()) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(constructErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + + //nolint:dupl + Context("when desiredConfig fully matches current and next config", func() { + It("should return false, false, nil", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1", "param2": "value2"}, + NextBootConfig: map[string]string{"param1": "value1", "param2": "value2"}, + DefaultConfig: map[string]string{"param1": "default1", "param2": "default2"}, + } + desiredConfig := map[string]string{"param1": "value1", "param2": "value2"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(false) + + configUpdate, reboot, err := manager.ValidateDeviceNvSpec(ctx, device) + Expect(configUpdate).To(BeFalse()) + Expect(reboot).To(BeFalse()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + + //nolint:dupl + Context("when desiredConfig fully matches next but not current config", func() { + It("should return false, true, nil", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "oldValue1", "param2": "value2"}, + NextBootConfig: map[string]string{"param1": "value1", "param2": "value2"}, + DefaultConfig: map[string]string{"param1": "default1", "param2": "default2"}, + } + desiredConfig := map[string]string{"param1": "value1", "param2": "value2"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(false) + + configUpdate, reboot, err := manager.ValidateDeviceNvSpec(ctx, device) + Expect(configUpdate).To(BeFalse()) + Expect(reboot).To(BeTrue()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + + //nolint:dupl + Context("when desiredConfig does not fully match next boot config", func() { + It("should return true, true, nil", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "oldValue1", "param2": "value2"}, + NextBootConfig: map[string]string{"param1": "wrongValue", "param2": "value2"}, + DefaultConfig: map[string]string{"param1": "default1", "param2": "default2"}, + } + desiredConfig := map[string]string{"param1": "value1", "param2": "value2"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(false) + + configUpdate, reboot, err := manager.ValidateDeviceNvSpec(ctx, device) + Expect(configUpdate).To(BeTrue()) + Expect(reboot).To(BeTrue()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + + Context("when AdvancedPCISettingsEnabled is true and a parameter is missing in CurrentConfig", func() { + It("should return an IncorrectSpecError", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param2": "value2"}, + NextBootConfig: map[string]string{"param1": "value1", "param2": "value2"}, + DefaultConfig: map[string]string{"param1": "default1", "param2": "default2"}, + } + desiredConfig := map[string]string{"param1": "value1", "param2": "value2"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(true) + + expectedErr := types.IncorrectSpecError( + fmt.Sprintf("Parameter %s unsupported for device %s", "param1", device.Name)) + + configUpdate, reboot, err := manager.ValidateDeviceNvSpec(ctx, device) + Expect(configUpdate).To(BeFalse()) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(expectedErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + }) + }) + Describe("hostManager.ApplyDeviceNvSpec", func() { + var ( + mockHostUtils mocks.HostUtils + mockConfigValidation mocks.ConfigValidation + manager hostManager + ctx context.Context + device *v1alpha1.NicDevice + pciAddress string + ) + + BeforeEach(func() { + mockHostUtils = mocks.HostUtils{} + mockConfigValidation = mocks.ConfigValidation{} + manager = hostManager{ + hostUtils: &mockHostUtils, + configValidation: &mockConfigValidation, + } + ctx = context.TODO() + pciAddress = "0000:3b:00.0" + + device = &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + ResetToDefault: false, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: pciAddress}, + }, + }, + } + }) + + Describe("ApplyDeviceNvSpec", func() { + Context("when ResetToDefault is true", func() { + BeforeEach(func() { + device.Spec.Configuration.ResetToDefault = true + }) + + It("should reset NV config and set AdvancedPCISettings parameter successfully", func() { + mockHostUtils.On("ResetNvConfig", pciAddress).Return(nil) + mockHostUtils. + On("SetNvConfigParameter", pciAddress, consts.AdvancedPCISettingsParam, consts.NvParamTrue). + Return(nil) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeTrue()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + }) + + It("should return error if ResetNvConfig fails", func() { + resetErr := errors.New("failed to reset nv config") + mockHostUtils.On("ResetNvConfig", pciAddress).Return(resetErr) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(resetErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + }) + + It("should return error if SetNvConfigParameter fails", func() { + mockHostUtils.On("ResetNvConfig", pciAddress).Return(nil) + setParamErr := errors.New("failed to set nv config parameter") + mockHostUtils. + On("SetNvConfigParameter", pciAddress, consts.AdvancedPCISettingsParam, consts.NvParamTrue). + Return(setParamErr) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(setParamErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + }) + }) + + Context("when ResetToDefault is false", func() { + Context("when QueryNvConfig returns an error", func() { + It("should return false and the error", func() { + queryErr := errors.New("failed to query nv config") + mockHostUtils.On("QueryNvConfig", ctx, pciAddress).Return(types.NewNvConfigQuery(), queryErr) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(queryErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + }) + }) + + Context("when AdvancedPCISettingsEnabled is false", func() { + It("should set AdvancedPCISettingsParam and reset NIC firmware successfully", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + desiredConfig := map[string]string{"param1": "value1"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(false) + mockHostUtils. + On("SetNvConfigParameter", pciAddress, consts.AdvancedPCISettingsParam, consts.NvParamTrue). + Return(nil) + mockHostUtils.On("ResetNicFirmware", ctx, pciAddress). + Return(nil) + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeTrue()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + + It("should return error if SetNvConfigParameter fails", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + mockHostUtils.On("QueryNvConfig", ctx, pciAddress).Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(false) + setParamErr := errors.New("failed to set nv config parameter") + mockHostUtils. + On("SetNvConfigParameter", pciAddress, consts.AdvancedPCISettingsParam, consts.NvParamTrue). + Return(setParamErr) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(setParamErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + + It("should return error if ResetNicFirmware fails", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + mockHostUtils.On("QueryNvConfig", ctx, pciAddress).Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(false) + mockHostUtils.On("SetNvConfigParameter", pciAddress, consts.AdvancedPCISettingsParam, consts.NvParamTrue). + Return(nil) + resetFirmwareErr := errors.New("failed to reset NIC firmware") + mockHostUtils.On("ResetNicFirmware", ctx, pciAddress).Return(resetFirmwareErr) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(resetFirmwareErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + + It("should return error if second QueryNvConfig fails", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil).Times(1) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(false) + mockHostUtils.On("SetNvConfigParameter", pciAddress, consts.AdvancedPCISettingsParam, consts.NvParamTrue). + Return(nil) + mockHostUtils.On("ResetNicFirmware", ctx, pciAddress). + Return(nil) + secondQueryErr := errors.New("failed to query nv config again") + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(types.NewNvConfigQuery(), secondQueryErr) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(secondQueryErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + + Context("when AdvancedPCISettingsEnabled is true", func() { + It("should construct desiredConfig and apply no changes if desiredConfig matches NextBootConfig", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + desiredConfig := map[string]string{"param1": "value1"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(true) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeTrue()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + + It("should construct desiredConfig and apply necessary changes successfully", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "oldValue1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + desiredConfig := map[string]string{"param1": "value2"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(true) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + mockHostUtils.On("SetNvConfigParameter", pciAddress, "param1", "value2"). + Return(nil) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeTrue()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + + It("should return error if ConstructNvParamMapFromTemplate fails", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + constructErr := errors.New("failed to construct desired config") + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(true) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(nil, constructErr) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(constructErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + + It("should return error if desiredConfig has a parameter not in NextBootConfig", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + desiredConfig := map[string]string{"param1": "value1", "param2": "value2"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(true) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + + expectedErr := types.IncorrectSpecError( + fmt.Sprintf("Parameter %s unsupported for device %s", "param2", device.Name)) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(expectedErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + + It("should return error if SetNvConfigParameter fails while applying params", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "oldValue1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + desiredConfig := map[string]string{"param1": "value3"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(true) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + setParamErr := errors.New("failed to set param1") + mockHostUtils.On("SetNvConfigParameter", pciAddress, "param1", "value3"). + Return(setParamErr) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeFalse()) + Expect(err).To(MatchError(setParamErr)) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + }) + + Context("when applying multiple parameters", func() { + It("should apply all parameters successfully and require a reboot", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "oldValue1", "param2": "oldValue2"}, + NextBootConfig: map[string]string{"param1": "newValue1", "param2": "newValue2"}, + DefaultConfig: map[string]string{"param1": "default1", "param2": "default2"}, + } + desiredConfig := map[string]string{"param1": "newValue3", "param2": "newValue3"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(true) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + mockHostUtils.On("SetNvConfigParameter", pciAddress, "param1", "newValue3"). + Return(nil) + mockHostUtils.On("SetNvConfigParameter", pciAddress, "param2", "newValue3"). + Return(nil) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeTrue()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + + Context("when no parameters need to be applied", func() { + It("should return true without applying any parameters", func() { + nvConfig := types.NvConfigQuery{ + CurrentConfig: map[string]string{"param1": "value1"}, + NextBootConfig: map[string]string{"param1": "value1"}, + DefaultConfig: map[string]string{"param1": "default1"}, + } + desiredConfig := map[string]string{"param1": "value1"} + + mockHostUtils.On("QueryNvConfig", ctx, pciAddress). + Return(nvConfig, nil) + mockConfigValidation.On("AdvancedPCISettingsEnabled", nvConfig.CurrentConfig). + Return(true) + mockConfigValidation.On("ConstructNvParamMapFromTemplate", device, nvConfig.DefaultConfig). + Return(desiredConfig, nil) + + reboot, err := manager.ApplyDeviceNvSpec(ctx, device) + Expect(reboot).To(BeTrue()) + Expect(err).To(BeNil()) + + mockHostUtils.AssertExpectations(GinkgoT()) + mockConfigValidation.AssertExpectations(GinkgoT()) + }) + }) + }) + }) }) diff --git a/pkg/host/mocks/HostManager.go b/pkg/host/mocks/HostManager.go index c23158a..1f1a909 100644 --- a/pkg/host/mocks/HostManager.go +++ b/pkg/host/mocks/HostManager.go @@ -3,8 +3,11 @@ package mocks import ( - v1alpha1 "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" + context "context" + mock "github.com/stretchr/testify/mock" + + v1alpha1 "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" ) // HostManager is an autogenerated mock type for the HostManager type @@ -12,6 +15,52 @@ type HostManager struct { mock.Mock } +// ApplyDeviceNvSpec provides a mock function with given fields: ctx, device +func (_m *HostManager) ApplyDeviceNvSpec(ctx context.Context, device *v1alpha1.NicDevice) (bool, error) { + ret := _m.Called(ctx, device) + + if len(ret) == 0 { + panic("no return value specified for ApplyDeviceNvSpec") + } + + var r0 bool + var r1 error + if rf, ok := ret.Get(0).(func(context.Context, *v1alpha1.NicDevice) (bool, error)); ok { + return rf(ctx, device) + } + if rf, ok := ret.Get(0).(func(context.Context, *v1alpha1.NicDevice) bool); ok { + r0 = rf(ctx, device) + } else { + r0 = ret.Get(0).(bool) + } + + if rf, ok := ret.Get(1).(func(context.Context, *v1alpha1.NicDevice) error); ok { + r1 = rf(ctx, device) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// ApplyDeviceRuntimeSpec provides a mock function with given fields: device +func (_m *HostManager) ApplyDeviceRuntimeSpec(device *v1alpha1.NicDevice) error { + ret := _m.Called(device) + + if len(ret) == 0 { + panic("no return value specified for ApplyDeviceRuntimeSpec") + } + + var r0 error + if rf, ok := ret.Get(0).(func(*v1alpha1.NicDevice) error); ok { + r0 = rf(device) + } else { + r0 = ret.Error(0) + } + + return r0 +} + // DiscoverNicDevices provides a mock function with given fields: func (_m *HostManager) DiscoverNicDevices() (map[string]v1alpha1.NicDeviceStatus, error) { ret := _m.Called() @@ -42,6 +91,41 @@ func (_m *HostManager) DiscoverNicDevices() (map[string]v1alpha1.NicDeviceStatus return r0, r1 } +// ValidateDeviceNvSpec provides a mock function with given fields: ctx, device +func (_m *HostManager) ValidateDeviceNvSpec(ctx context.Context, device *v1alpha1.NicDevice) (bool, bool, error) { + ret := _m.Called(ctx, device) + + if len(ret) == 0 { + panic("no return value specified for ValidateDeviceNvSpec") + } + + var r0 bool + var r1 bool + var r2 error + if rf, ok := ret.Get(0).(func(context.Context, *v1alpha1.NicDevice) (bool, bool, error)); ok { + return rf(ctx, device) + } + if rf, ok := ret.Get(0).(func(context.Context, *v1alpha1.NicDevice) bool); ok { + r0 = rf(ctx, device) + } else { + r0 = ret.Get(0).(bool) + } + + if rf, ok := ret.Get(1).(func(context.Context, *v1alpha1.NicDevice) bool); ok { + r1 = rf(ctx, device) + } else { + r1 = ret.Get(1).(bool) + } + + if rf, ok := ret.Get(2).(func(context.Context, *v1alpha1.NicDevice) error); ok { + r2 = rf(ctx, device) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + // NewHostManager creates a new instance of HostManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. // The first argument is typically a *testing.T value. func NewHostManager(t interface { diff --git a/pkg/host/mocks/HostUtils.go b/pkg/host/mocks/HostUtils.go index fbcbbf1..8d12541 100644 --- a/pkg/host/mocks/HostUtils.go +++ b/pkg/host/mocks/HostUtils.go @@ -3,8 +3,13 @@ package mocks import ( - pci "github.com/jaypipes/ghw/pkg/pci" + context "context" + mock "github.com/stretchr/testify/mock" + + pci "github.com/jaypipes/ghw/pkg/pci" + + types "github.com/Mellanox/nic-configuration-operator/pkg/types" ) // HostUtils is an autogenerated mock type for the HostUtils type @@ -12,36 +17,6 @@ type HostUtils struct { mock.Mock } -// Chroot provides a mock function with given fields: path -func (_m *HostUtils) Chroot(path string) (func() error, error) { - ret := _m.Called(path) - - if len(ret) == 0 { - panic("no return value specified for Chroot") - } - - var r0 func() error - var r1 error - if rf, ok := ret.Get(0).(func(string) (func() error, error)); ok { - return rf(path) - } - if rf, ok := ret.Get(0).(func(string) func() error); ok { - r0 = rf(path) - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).(func() error) - } - } - - if rf, ok := ret.Get(1).(func(string) error); ok { - r1 = rf(path) - } else { - r1 = ret.Error(1) - } - - return r0, r1 -} - // GetFirmwareVersionAndPSID provides a mock function with given fields: pciAddr func (_m *HostUtils) GetFirmwareVersionAndPSID(pciAddr string) (string, string, error) { ret := _m.Called(pciAddr) @@ -113,6 +88,34 @@ func (_m *HostUtils) GetLinkType(name string) string { return r0 } +// GetMaxReadRequestSize provides a mock function with given fields: pciAddr +func (_m *HostUtils) GetMaxReadRequestSize(pciAddr string) (int, error) { + ret := _m.Called(pciAddr) + + if len(ret) == 0 { + panic("no return value specified for GetMaxReadRequestSize") + } + + var r0 int + var r1 error + if rf, ok := ret.Get(0).(func(string) (int, error)); ok { + return rf(pciAddr) + } + if rf, ok := ret.Get(0).(func(string) int); ok { + r0 = rf(pciAddr) + } else { + r0 = ret.Get(0).(int) + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(pciAddr) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // GetPCIDevices provides a mock function with given fields: func (_m *HostUtils) GetPCIDevices() ([]*pci.Device, error) { ret := _m.Called() @@ -143,6 +146,34 @@ func (_m *HostUtils) GetPCIDevices() ([]*pci.Device, error) { return r0, r1 } +// GetPCILinkSpeed provides a mock function with given fields: pciAddr +func (_m *HostUtils) GetPCILinkSpeed(pciAddr string) (int, error) { + ret := _m.Called(pciAddr) + + if len(ret) == 0 { + panic("no return value specified for GetPCILinkSpeed") + } + + var r0 int + var r1 error + if rf, ok := ret.Get(0).(func(string) (int, error)); ok { + return rf(pciAddr) + } + if rf, ok := ret.Get(0).(func(string) int); ok { + r0 = rf(pciAddr) + } else { + r0 = ret.Get(0).(int) + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(pciAddr) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // GetPartAndSerialNumber provides a mock function with given fields: pciAddr func (_m *HostUtils) GetPartAndSerialNumber(pciAddr string) (string, string, error) { ret := _m.Called(pciAddr) @@ -196,6 +227,41 @@ func (_m *HostUtils) GetRDMADeviceName(pciAddr string) string { return r0 } +// GetTrustAndPFC provides a mock function with given fields: interfaceName +func (_m *HostUtils) GetTrustAndPFC(interfaceName string) (string, string, error) { + ret := _m.Called(interfaceName) + + if len(ret) == 0 { + panic("no return value specified for GetTrustAndPFC") + } + + var r0 string + var r1 string + var r2 error + if rf, ok := ret.Get(0).(func(string) (string, string, error)); ok { + return rf(interfaceName) + } + if rf, ok := ret.Get(0).(func(string) string); ok { + r0 = rf(interfaceName) + } else { + r0 = ret.Get(0).(string) + } + + if rf, ok := ret.Get(1).(func(string) string); ok { + r1 = rf(interfaceName) + } else { + r1 = ret.Get(1).(string) + } + + if rf, ok := ret.Get(2).(func(string) error); ok { + r2 = rf(interfaceName) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + // IsSriovVF provides a mock function with given fields: pciAddr func (_m *HostUtils) IsSriovVF(pciAddr string) bool { ret := _m.Called(pciAddr) @@ -214,6 +280,142 @@ func (_m *HostUtils) IsSriovVF(pciAddr string) bool { return r0 } +// QueryNvConfig provides a mock function with given fields: ctx, pciAddr +func (_m *HostUtils) QueryNvConfig(ctx context.Context, pciAddr string) (types.NvConfigQuery, error) { + ret := _m.Called(ctx, pciAddr) + + if len(ret) == 0 { + panic("no return value specified for QueryNvConfig") + } + + var r0 types.NvConfigQuery + var r1 error + if rf, ok := ret.Get(0).(func(context.Context, string) (types.NvConfigQuery, error)); ok { + return rf(ctx, pciAddr) + } + if rf, ok := ret.Get(0).(func(context.Context, string) types.NvConfigQuery); ok { + r0 = rf(ctx, pciAddr) + } else { + r0 = ret.Get(0).(types.NvConfigQuery) + } + + if rf, ok := ret.Get(1).(func(context.Context, string) error); ok { + r1 = rf(ctx, pciAddr) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// ResetNicFirmware provides a mock function with given fields: ctx, pciAddr +func (_m *HostUtils) ResetNicFirmware(ctx context.Context, pciAddr string) error { + ret := _m.Called(ctx, pciAddr) + + if len(ret) == 0 { + panic("no return value specified for ResetNicFirmware") + } + + var r0 error + if rf, ok := ret.Get(0).(func(context.Context, string) error); ok { + r0 = rf(ctx, pciAddr) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ResetNvConfig provides a mock function with given fields: pciAddr +func (_m *HostUtils) ResetNvConfig(pciAddr string) error { + ret := _m.Called(pciAddr) + + if len(ret) == 0 { + panic("no return value specified for ResetNvConfig") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(pciAddr) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ScheduleReboot provides a mock function with given fields: +func (_m *HostUtils) ScheduleReboot() error { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for ScheduleReboot") + } + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// SetMaxReadRequestSize provides a mock function with given fields: pciAddr, maxReadRequestSize +func (_m *HostUtils) SetMaxReadRequestSize(pciAddr string, maxReadRequestSize int) error { + ret := _m.Called(pciAddr, maxReadRequestSize) + + if len(ret) == 0 { + panic("no return value specified for SetMaxReadRequestSize") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string, int) error); ok { + r0 = rf(pciAddr, maxReadRequestSize) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// SetNvConfigParameter provides a mock function with given fields: pciAddr, paramName, paramValue +func (_m *HostUtils) SetNvConfigParameter(pciAddr string, paramName string, paramValue string) error { + ret := _m.Called(pciAddr, paramName, paramValue) + + if len(ret) == 0 { + panic("no return value specified for SetNvConfigParameter") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string, string, string) error); ok { + r0 = rf(pciAddr, paramName, paramValue) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// SetTrustAndPFC provides a mock function with given fields: interfaceName, trust, pfc +func (_m *HostUtils) SetTrustAndPFC(interfaceName string, trust string, pfc string) error { + ret := _m.Called(interfaceName, trust, pfc) + + if len(ret) == 0 { + panic("no return value specified for SetTrustAndPFC") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string, string, string) error); ok { + r0 = rf(interfaceName, trust, pfc) + } else { + r0 = ret.Error(0) + } + + return r0 +} + // NewHostUtils creates a new instance of HostUtils. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. // The first argument is typically a *testing.T value. func NewHostUtils(t interface { diff --git a/pkg/host/mocks/configValidation.go b/pkg/host/mocks/configValidation.go new file mode 100644 index 0000000..1132ac3 --- /dev/null +++ b/pkg/host/mocks/configValidation.go @@ -0,0 +1,174 @@ +// Code generated by mockery v2.45.0. DO NOT EDIT. + +package mocks + +import ( + v1alpha1 "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" + types "github.com/Mellanox/nic-configuration-operator/pkg/types" + mock "github.com/stretchr/testify/mock" +) + +// configValidation is an autogenerated mock type for the configValidation type +type ConfigValidation struct { + mock.Mock +} + +// AdvancedPCISettingsEnabled provides a mock function with given fields: currentConfig +func (_m *ConfigValidation) AdvancedPCISettingsEnabled(currentConfig map[string]string) bool { + ret := _m.Called(currentConfig) + + if len(ret) == 0 { + panic("no return value specified for AdvancedPCISettingsEnabled") + } + + var r0 bool + if rf, ok := ret.Get(0).(func(map[string]string) bool); ok { + r0 = rf(currentConfig) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// CalculateDesiredRuntimeConfig provides a mock function with given fields: device +func (_m *ConfigValidation) CalculateDesiredRuntimeConfig(device *v1alpha1.NicDevice) (int, string, string) { + ret := _m.Called(device) + + if len(ret) == 0 { + panic("no return value specified for CalculateDesiredRuntimeConfig") + } + + var r0 int + var r1 string + var r2 string + if rf, ok := ret.Get(0).(func(*v1alpha1.NicDevice) (int, string, string)); ok { + return rf(device) + } + if rf, ok := ret.Get(0).(func(*v1alpha1.NicDevice) int); ok { + r0 = rf(device) + } else { + r0 = ret.Get(0).(int) + } + + if rf, ok := ret.Get(1).(func(*v1alpha1.NicDevice) string); ok { + r1 = rf(device) + } else { + r1 = ret.Get(1).(string) + } + + if rf, ok := ret.Get(2).(func(*v1alpha1.NicDevice) string); ok { + r2 = rf(device) + } else { + r2 = ret.Get(2).(string) + } + + return r0, r1, r2 +} + +// ConstructNvParamMapFromTemplate provides a mock function with given fields: device, defaultValues +func (_m *ConfigValidation) ConstructNvParamMapFromTemplate(device *v1alpha1.NicDevice, defaultValues map[string]string) (map[string]string, error) { + ret := _m.Called(device, defaultValues) + + if len(ret) == 0 { + panic("no return value specified for ConstructNvParamMapFromTemplate") + } + + var r0 map[string]string + var r1 error + if rf, ok := ret.Get(0).(func(*v1alpha1.NicDevice, map[string]string) (map[string]string, error)); ok { + return rf(device, defaultValues) + } + if rf, ok := ret.Get(0).(func(*v1alpha1.NicDevice, map[string]string) map[string]string); ok { + r0 = rf(device, defaultValues) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(map[string]string) + } + } + + if rf, ok := ret.Get(1).(func(*v1alpha1.NicDevice, map[string]string) error); ok { + r1 = rf(device, defaultValues) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// RuntimeConfigApplied provides a mock function with given fields: device +func (_m *ConfigValidation) RuntimeConfigApplied(device *v1alpha1.NicDevice) (bool, error) { + ret := _m.Called(device) + + if len(ret) == 0 { + panic("no return value specified for RuntimeConfigApplied") + } + + var r0 bool + var r1 error + if rf, ok := ret.Get(0).(func(*v1alpha1.NicDevice) (bool, error)); ok { + return rf(device) + } + if rf, ok := ret.Get(0).(func(*v1alpha1.NicDevice) bool); ok { + r0 = rf(device) + } else { + r0 = ret.Get(0).(bool) + } + + if rf, ok := ret.Get(1).(func(*v1alpha1.NicDevice) error); ok { + r1 = rf(device) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// ValidateResetToDefault provides a mock function with given fields: nvConfig +func (_m *ConfigValidation) ValidateResetToDefault(nvConfig types.NvConfigQuery) (bool, bool, error) { + ret := _m.Called(nvConfig) + + if len(ret) == 0 { + panic("no return value specified for ValidateResetToDefault") + } + + var r0 bool + var r1 bool + var r2 error + if rf, ok := ret.Get(0).(func(types.NvConfigQuery) (bool, bool, error)); ok { + return rf(nvConfig) + } + if rf, ok := ret.Get(0).(func(types.NvConfigQuery) bool); ok { + r0 = rf(nvConfig) + } else { + r0 = ret.Get(0).(bool) + } + + if rf, ok := ret.Get(1).(func(types.NvConfigQuery) bool); ok { + r1 = rf(nvConfig) + } else { + r1 = ret.Get(1).(bool) + } + + if rf, ok := ret.Get(2).(func(types.NvConfigQuery) error); ok { + r2 = rf(nvConfig) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// newConfigValidation creates a new instance of configValidation. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func newConfigValidation(t interface { + mock.TestingT + Cleanup(func()) +}) *ConfigValidation { + mock := &ConfigValidation{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/pkg/host/utils.go b/pkg/host/utils.go index 2a09cea..887e72d 100644 --- a/pkg/host/utils.go +++ b/pkg/host/utils.go @@ -17,10 +17,14 @@ package host import ( "bufio" + "context" "fmt" "os" "path/filepath" + "regexp" + "strconv" "strings" + "syscall" "github.com/Mellanox/rdmamap" "github.com/jaypipes/ghw" @@ -30,9 +34,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "github.com/Mellanox/nic-configuration-operator/pkg/consts" + "github.com/Mellanox/nic-configuration-operator/pkg/types" ) const pciDevicesPath = "/sys/bus/pci/devices" +const arrayPrefix = "Array" // HostUtils is an interface that contains util functions that perform operations on the actual host type HostUtils interface { @@ -42,6 +48,12 @@ type HostUtils interface { GetPartAndSerialNumber(pciAddr string) (string, string, error) // GetFirmwareVersionAndPSID uses mstflint tool to retrieve FW version and PSID of the device GetFirmwareVersionAndPSID(pciAddr string) (string, string, error) + // GetPCILinkSpeed return PCI bus speed in GT/s + GetPCILinkSpeed(pciAddr string) (int, error) + // GetMaxReadRequestSize returns MaxReadRequest size for PCI device + GetMaxReadRequestSize(pciAddr string) (int, error) + // GetTrustAndPFC returns trust and pfc settings for network interface + GetTrustAndPFC(interfaceName string) (string, string, error) // GetRDMADeviceName returns a RDMA device name for the given PCI address GetRDMADeviceName(pciAddr string) string // GetInterfaceName returns a network interface name for the given PCI address @@ -50,6 +62,22 @@ type HostUtils interface { GetLinkType(name string) string // IsSriovVF return true if the device is a SRIOV VF, false otherwise IsSriovVF(pciAddr string) bool + // QueryNvConfig queries nv config for a mellanox device and returns default, current and next boot configs + QueryNvConfig(ctx context.Context, pciAddr string) (types.NvConfigQuery, error) + // SetNvConfigParameter sets a nv config parameter for a mellanox device + SetNvConfigParameter(pciAddr string, paramName string, paramValue string) error + // ResetNvConfig resets NIC's nv config + ResetNvConfig(pciAddr string) error + // ResetNicFirmware resets NIC's firmware + // Operation can be long, required context to be able to terminate by timeout + // IB devices need to communicate with other nodes for confirmation + ResetNicFirmware(ctx context.Context, pciAddr string) error + // SetMaxReadRequestSize sets max read request size for PCI device + SetMaxReadRequestSize(pciAddr string, maxReadRequestSize int) error + // SetTrustAndPFC sets trust and PFC settings for a network interface + SetTrustAndPFC(interfaceName string, trust string, pfc string) error + // ScheduleReboot schedules reboot on the host + ScheduleReboot() error } type hostUtils struct { @@ -141,6 +169,128 @@ func (h *hostUtils) GetFirmwareVersionAndPSID(pciAddr string) (string, string, e return firmwareVersion, PSID, nil } +// GetPCILinkSpeed return PCI bus speed in GT/s +func (h *hostUtils) GetPCILinkSpeed(pciAddr string) (int, error) { + log.Log.Info("HostUtils.GetPCILinkSpeed()", "pciAddr", pciAddr) + cmd := h.execInterface.Command("lspci", "-vv", "-s", pciAddr) + output, err := cmd.Output() + if err != nil { + log.Log.Error(err, "GetPCILinkSpeed(): Failed to run lspci") + return -1, err + } + + linkSpeedRegexp := regexp.MustCompile(`speed\s+([0-9.]+)gt/s`) + + // Parse the output for LnkSta + scanner := bufio.NewScanner(strings.NewReader(string(output))) + + for scanner.Scan() { + line := strings.TrimSpace(strings.ToLower(scanner.Text())) + + if !strings.HasPrefix(line, consts.LinkStatsPrefix) { + continue + } + + match := linkSpeedRegexp.FindStringSubmatch(line) + if len(match) == 2 { + speedValue, err := strconv.Atoi(match[1]) + if err != nil { + log.Log.Error(err, "failed to parse link speed value", "pciAddr", pciAddr) + return -1, err + } + + return speedValue, nil + } + } + + if err := scanner.Err(); err != nil { + log.Log.Error(err, "GetPCILinkSpeed(): Error reading lspci output") + return -1, err + } + + return -1, nil +} + +// GetMaxReadRequestSize returns MaxReadRequest size for PCI device +func (h *hostUtils) GetMaxReadRequestSize(pciAddr string) (int, error) { + log.Log.Info("HostUtils.GetMaxReadRequestSize()", "pciAddr", pciAddr) + cmd := h.execInterface.Command("lspci", "-vv", "-s", pciAddr) + output, err := cmd.Output() + if err != nil && len(output) == 0 { + log.Log.Error(err, "GetMaxReadRequestSize(): Failed to run lspci") + return -1, err + } + + maxReadReqRegexp := regexp.MustCompile(consts.MaxReadReqPrefix + `\s+(\d+)\s+bytes`) + + // Parse the output for LnkSta + scanner := bufio.NewScanner(strings.NewReader(string(output))) + + for scanner.Scan() { + line := strings.TrimSpace(strings.ToLower(scanner.Text())) + + if strings.Contains(line, consts.MaxReadReqPrefix) { + match := maxReadReqRegexp.FindStringSubmatch(line) + if len(match) != 2 { + continue + } + + maxReqReqSize, err := strconv.Atoi(match[1]) + if err != nil { + log.Log.Error(err, "failed to parse max read req size", "pciAddr", pciAddr) + return -1, err + } + + return maxReqReqSize, nil + } + } + + if err := scanner.Err(); err != nil { + log.Log.Error(err, "GetMaxReadRequestSize(): Error reading lspci output") + return -1, err + } + + return -1, nil +} + +// GetTrustAndPFC returns trust and pfc settings for network interface +func (h *hostUtils) GetTrustAndPFC(interfaceName string) (string, string, error) { + log.Log.Info("HostUtils.GetTrustAndPFC()", "interface", interfaceName) + cmd := h.execInterface.Command("mlnx_qos", "-i", interfaceName) + output, err := cmd.CombinedOutput() + if err != nil { + err = fmt.Errorf("failed to run mlnx_qos: %s", output) + log.Log.Error(err, "GetTrustAndPFC(): Failed to run mlnx_qos") + return "", "", err + } + + // Parse the output for trust and pfc states + scanner := bufio.NewScanner(strings.NewReader(string(output))) + var trust, pfc string + + for scanner.Scan() { + line := strings.TrimSpace(strings.ToLower(scanner.Text())) + + if strings.HasPrefix(line, consts.TrustStatePrefix) { + trust = strings.TrimSpace(strings.TrimPrefix(line, consts.TrustStatePrefix)) + } + if strings.HasPrefix(line, consts.PfcEnabledPrefix) { + pfc = strings.Replace(strings.TrimSpace(strings.TrimPrefix(line, consts.PfcEnabledPrefix)), " ", ",", -1) + } + } + + if err := scanner.Err(); err != nil { + log.Log.Error(err, "GetTrustAndPFC(): Error reading mlnx_qos output") + return "", "", err + } + + if trust == "" || pfc == "" { + return "", "", fmt.Errorf("GetTrustAndPFC(): trust (%v) or pfc (%v) is empty", trust, pfc) + } + + return trust, pfc, nil +} + // GetLinkType return the link type of the net device (Ethernet / Infiniband) func (h *hostUtils) GetLinkType(name string) string { log.Log.Info("HostUtils.GetLinkType()", "name", name) @@ -162,7 +312,6 @@ func encapTypeToLinkType(encapType string) string { } func getNetNames(pciAddr string) ([]string, error) { - var names []string netDir := filepath.Join(pciDevicesPath, pciAddr, "net") if _, err := os.Lstat(netDir); err != nil { return nil, fmt.Errorf("GetNetNames(): no net directory under pci device %s: %q", pciAddr, err) @@ -173,7 +322,7 @@ func getNetNames(pciAddr string) ([]string, error) { return nil, fmt.Errorf("GetNetNames(): failed to read net directory %s: %q", netDir, err) } - names = make([]string, 0) + names := make([]string, 0) for _, f := range fInfos { names = append(names, f.Name()) } @@ -220,6 +369,242 @@ func (h *hostUtils) GetRDMADeviceName(pciAddr string) string { return rdmaDevices[0] } +// queryMSTConfig runs a query on mstconfig to parse out default, current and nextboot configurations +// might run recursively to expand array parameters' values +func (h *hostUtils) queryMSTConfig(ctx context.Context, query types.NvConfigQuery, pciAddr string, additionalParameter string) error { + log.Log.Info(fmt.Sprintf("mstconfig -d %s query %s", pciAddr, additionalParameter)) //TODO change verbosity + valueInBracketsRegex := regexp.MustCompile(`\(([^)]*)\)$`) + + var cmd execUtils.Cmd + if additionalParameter == "" { + cmd = h.execInterface.CommandContext(ctx, "mstconfig", "-d", pciAddr, "-e", "query") + } else { + cmd = h.execInterface.CommandContext(ctx, "mstconfig", "-d", pciAddr, "-e", "query", additionalParameter) + } + output, err := cmd.Output() + if err != nil { + log.Log.Error(err, "queryMSTConfig(): Failed to run mstconfig", "output", string(output)) + return err + } + + inConfigSection := false + scanner := bufio.NewScanner(strings.NewReader(string(output))) + for scanner.Scan() { + line := scanner.Text() + + // Trim leading and trailing whitespace + line = strings.TrimSpace(line) + + // Skip empty lines + if line == "" { + continue + } + // Check for the start of the configurations section + if strings.HasPrefix(line, "Configurations:") { + inConfigSection = true + continue + } + + // If in configurations section, parse the additionalParameters + if inConfigSection { + // Check if we have reached the end of the configurations section + // In this example, we'll assume the configurations end when the scanner reaches EOF + // Alternatively, you can check for specific markers or conditions + + if strings.HasPrefix(line, "*") { + line = strings.TrimPrefix(line, "*") + line = strings.TrimSpace(line) + } + + // Replace multiple spaces with a single tab character + spaceRe := regexp.MustCompile(`\s{2,}`) + line = spaceRe.ReplaceAllString(line, "\t") + + fields := strings.Split(line, "\t") + if len(fields) != 4 { + // Line does not contain additionalParameters and values, skipping + continue + } + + for i := range fields { + fields[i] = strings.TrimSpace(fields[i]) + } + + paramName := fields[0] + defaultVal := fields[1] + currentVal := fields[2] + nextBootVal := fields[3] + + // If the parameter value is an array, we want to extract values for all indices + if strings.HasPrefix(defaultVal, arrayPrefix) { + err = h.queryMSTConfig(ctx, query, pciAddr, paramName+strings.TrimPrefix(defaultVal, arrayPrefix)) + if err != nil { + return err + } + continue + } + + // If the actual value is wrapped in brackets, we want to extract it + match := valueInBracketsRegex.FindStringSubmatch(defaultVal) + if len(match) == 2 { + defaultVal = match[1] + + match = valueInBracketsRegex.FindStringSubmatch(currentVal) + currentVal = match[1] + + match = valueInBracketsRegex.FindStringSubmatch(nextBootVal) + nextBootVal = match[1] + } + + query.DefaultConfig[paramName] = defaultVal + query.CurrentConfig[paramName] = currentVal + query.NextBootConfig[paramName] = nextBootVal + } + } + + return nil +} + +// QueryNvConfig queries nv config for a mellanox device and returns default, current and next boot configs +func (h *hostUtils) QueryNvConfig(ctx context.Context, pciAddr string) (types.NvConfigQuery, error) { + log.Log.Info("HostUtils.QueryNvConfig()", "pciAddr", pciAddr) + + query := types.NewNvConfigQuery() + + err := h.queryMSTConfig(ctx, query, pciAddr, "") + if err != nil { + log.Log.Error(err, "Failed to parse mstconfig query output", "device", pciAddr) + } + + return query, err +} + +// SetNvConfigParameter sets a nv config parameter for a mellanox device +func (h *hostUtils) SetNvConfigParameter(pciAddr string, paramName string, paramValue string) error { + log.Log.Info("HostUtils.SetNvConfigParameter()", "pciAddr", pciAddr, "paramName", paramName, "paramValue", paramValue) + + cmd := h.execInterface.Command("mstconfig", "-d", pciAddr, "--yes", "set", paramName+"="+paramValue) + _, err := cmd.Output() + if err != nil { + log.Log.Error(err, "SetNvConfigParameter(): Failed to run mstconfig") + return err + } + return nil +} + +// ResetNvConfig resets NIC's nv config +func (h *hostUtils) ResetNvConfig(pciAddr string) error { + log.Log.Info("HostUtils.ResetNvConfig()", "pciAddr", pciAddr) + + cmd := h.execInterface.Command("mstconfig", "-d", pciAddr, "--yes", "reset") + _, err := cmd.Output() + if err != nil { + log.Log.Error(err, "ResetNvConfig(): Failed to run mstconfig") + return err + } + return nil +} + +// ResetNicFirmware resets NIC's firmware +// Operation can be long, required context to be able to terminate by timeout +// IB devices need to communicate with other nodes for confirmation +func (h *hostUtils) ResetNicFirmware(ctx context.Context, pciAddr string) error { + log.Log.Info("HostUtils.ResetNicFirmware()", "pciAddr", pciAddr) + + cmd := h.execInterface.CommandContext(ctx, "mlxfwreset", "--device", pciAddr, "reset", "--yes") + _, err := cmd.Output() + if err != nil { + log.Log.Error(err, "ResetNicFirmware(): Failed to run mlxfwreset") + return err + } + return nil +} + +// SetMaxReadRequestSize sets max read request size for PCI device +func (h *hostUtils) SetMaxReadRequestSize(pciAddr string, maxReadRequestSize int) error { + log.Log.Info("HostUtils.SetMaxReadRequestSize()", "pciAddr", pciAddr, "maxReadRequestSize", maxReadRequestSize) + + // Meaning of the value is explained here: + // https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance#PCIe-Max-Read-Request + readReqSizeToIndex := map[int]int{ + 128: 0, + 256: 1, + 512: 2, + 1024: 3, + 2048: 4, + 4096: 5, + } + + valueToApply, found := readReqSizeToIndex[maxReadRequestSize] + if !found { + err := fmt.Errorf("unsupported maxReadRequestSize (%d) for pci device (%s). Acceptable values are powers of 2 from 128 to 4096", maxReadRequestSize, pciAddr) + log.Log.Error(err, "failed to set maxReadRequestSize", "pciAddr", pciAddr, "maxReadRequestSize", maxReadRequestSize) + return err + } + + cmd := h.execInterface.Command("setpci", "-s", pciAddr, fmt.Sprintf("CAP_EXP+08.w=%d000:FFFF", valueToApply)) + _, err := cmd.Output() + if err != nil { + log.Log.Error(err, "SetMaxReadRequestSize(): Failed to run setpci") + return err + } + return nil +} + +// SetTrustAndPFC sets trust and PFC settings for a network interface +func (h *hostUtils) SetTrustAndPFC(interfaceName string, trust string, pfc string) error { + log.Log.Info("HostUtils.SetTrustAndPFC()", "interfaceName", interfaceName, "trust", trust, "pfc", pfc) + + cmd := h.execInterface.Command("mlnx_qos", "-i", interfaceName, "--trust", trust, "--pfc", pfc) + output, err := cmd.CombinedOutput() + if err != nil { + err = fmt.Errorf("failed to run mlnx_qos: %s", output) + log.Log.Error(err, "SetTrustAndPFC(): Failed to run mlnx_qos") + return err + } + return nil +} + +func (h *hostUtils) ScheduleReboot() error { + log.Log.Info("HostUtils.ScheduleReboot()") + root, err := os.Open("/") + if err != nil { + log.Log.Error(err, "ScheduleReboot(): Failed to os.Open") + return err + } + + if err := syscall.Chroot(consts.HostPath); err != nil { + err := root.Close() + if err != nil { + log.Log.Error(err, "ScheduleReboot(): Failed to syscall.Chroot") + return err + } + return err + } + + defer func() { + if err := root.Close(); err != nil { + log.Log.Error(err, "ScheduleReboot(): Failed to os.Close") + return + } + if err := root.Chdir(); err != nil { + log.Log.Error(err, "ScheduleReboot(): Failed to os.Chdir") + return + } + if err = syscall.Chroot("."); err != nil { + log.Log.Error(err, "ScheduleReboot(): Failed to syscall.Chroot") + } + }() + + cmd := h.execInterface.Command("shutdown", "-r", "now") + _, err = cmd.Output() + if err != nil { + log.Log.Error(err, "ScheduleReboot(): Failed to run shutdown -r now") + return err + } + return nil +} + func NewHostUtils() HostUtils { return &hostUtils{execInterface: execUtils.New()} } diff --git a/pkg/host/utils_test.go b/pkg/host/utils_test.go index 30532f3..0e3333b 100644 --- a/pkg/host/utils_test.go +++ b/pkg/host/utils_test.go @@ -16,6 +16,9 @@ limitations under the License. package host import ( + "context" + "errors" + "fmt" "strings" . "github.com/onsi/ginkgo/v2" @@ -24,14 +27,10 @@ import ( execTesting "k8s.io/utils/exec/testing" ) -var _ = Describe("HostUtils", func() { - var ( - pciAddress = "0000:3b:00.0" - ) - - BeforeEach(func() { - }) +const pciAddress = "0000:03:00.0" +var _ = Describe("HostUtils", func() { + //nolint:dupl Describe("GetPartAndSerialNumber", func() { It("should return lowercased part and serial numbers", func() { partNumber := "partNumber" @@ -106,6 +105,7 @@ var _ = Describe("HostUtils", func() { Expect(serial).To(Equal("")) }) }) + //nolint:dupl Describe("GetFirmwareVersionAndPSID", func() { It("should return lowercased firmware version and psid", func() { fwVersion := "VeRsIoN" @@ -180,4 +180,505 @@ var _ = Describe("HostUtils", func() { Expect(serial).To(Equal("")) }) }) + Describe("GetPCILinkSpeed", func() { + var ( + h *hostUtils + fakeExec *execTesting.FakeExec + pciAddr string + ) + + BeforeEach(func() { + fakeExec = &execTesting.FakeExec{} + + h = &hostUtils{ + execInterface: fakeExec, + } + }) + + It("should return the correct PCI link speed when lspci output is valid", func() { + lspciOutput := ` +03:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] + Subsystem: Mellanox Technologies Device 0000 + ... + LnkSta: Speed 8GT/s (ok), Width x8 (ok) +` + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, + func() ([]byte, []byte, error) { + return []byte(lspciOutput), nil, nil + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("lspci")) + Expect(args).To(Equal([]string{"-vv", "-s", pciAddr})) + return fakeCmd + }, + } + + speed, err := h.GetPCILinkSpeed(pciAddr) + Expect(err).ToNot(HaveOccurred()) + Expect(speed).To(Equal(8)) + }) + + It("should return an error when the speed value is not an integer", func() { + // Mock lspci output with a decimal speed + lspciOutput := ` +03:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] + ... + LnkSta: Speed 2.5GT/s (ok), Width x8 (ok) +` + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, + func() ([]byte, []byte, error) { + return []byte(lspciOutput), nil, nil + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("lspci")) + Expect(args).To(Equal([]string{"-vv", "-s", pciAddr})) + return fakeCmd + }, + } + + speed, err := h.GetPCILinkSpeed(pciAddr) + Expect(err).To(HaveOccurred()) + Expect(speed).To(Equal(-1)) + }) + + It("should return -1 when the lspci output does not contain LnkSta line", func() { + lspciOutput := ` +03:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] + Subsystem: Mellanox Technologies Device 0000 + ... +` + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, + func() ([]byte, []byte, error) { + return []byte(lspciOutput), nil, nil + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("lspci")) + Expect(args).To(Equal([]string{"-vv", "-s", pciAddr})) + return fakeCmd + }, + } + + speed, err := h.GetPCILinkSpeed(pciAddr) + Expect(err).ToNot(HaveOccurred()) + Expect(speed).To(Equal(-1)) + }) + + It("should return an error when lspci command fails", func() { + // Mock the command to fail + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, + func() ([]byte, []byte, error) { + return nil, nil, fmt.Errorf("lspci command failed") + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("lspci")) + Expect(args).To(Equal([]string{"-vv", "-s", pciAddr})) + return fakeCmd + }, + } + + speed, err := h.GetPCILinkSpeed(pciAddr) + Expect(err).To(HaveOccurred()) + Expect(speed).To(Equal(-1)) + }) + }) + Describe("queryMSTConfig", func() { + var ( + h *hostUtils + fakeExec *execTesting.FakeExec + ) + + BeforeEach(func() { + fakeExec = &execTesting.FakeExec{} + + // Prepare the fake commands and their outputs + // First command: mstconfig -d 0000:03:00.0 q + cmd1 := &execTesting.FakeCmd{} + cmd1.OutputScript = append(cmd1.OutputScript, + func() ([]byte, []byte, error) { + output := ` +Device #1: +---------- + +Device type: ConnectX4 +Configurations: Default Current Next Boot +* KEEP_IB_LINK_UP_P2 False(0) True(1) True(1) + KEEP_LINK_UP_ON_BOOT_P2 False(0) False(0) False(0) + ESWITCH_HAIRPIN_DESCRIPTORS Array[0..7] Array[0..7] Array[0..7] +* MEMIC_SIZE_LIMIT _256KB(1) _256KB(1) _256KB(0) +The '*' shows parameters with next value different from default/current value. +` + return []byte(output), nil, nil + }, + ) + + // Second command: mstconfig -d 0000:03:00.0 q ESWITCH_HAIRPIN_DESCRIPTORS[0..7] + cmd2 := &execTesting.FakeCmd{} + cmd2.OutputScript = append(cmd2.OutputScript, + func() ([]byte, []byte, error) { + output := ` +Configurations: Default Current Next Boot + ESWITCH_HAIRPIN_DESCRIPTORS[0] 128 128 128 + ESWITCH_HAIRPIN_DESCRIPTORS[1] 128 128 128 + ESWITCH_HAIRPIN_DESCRIPTORS[2] 128 128 128 + ESWITCH_HAIRPIN_DESCRIPTORS[3] 128 128 128 + ESWITCH_HAIRPIN_DESCRIPTORS[4] 128 128 128 + ESWITCH_HAIRPIN_DESCRIPTORS[5] 128 128 128 + ESWITCH_HAIRPIN_DESCRIPTORS[6] 128 128 128 + ESWITCH_HAIRPIN_DESCRIPTORS[7] 128 128 128 +` + return []byte(output), nil, nil + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("mstconfig")) + Expect(args).To(Equal([]string{"-d", pciAddress, "-e", "query"})) + return cmd1 + }, + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("mstconfig")) + Expect(args).To(Equal([]string{"-d", pciAddress, "-e", "query", "ESWITCH_HAIRPIN_DESCRIPTORS[0..7]"})) + return cmd2 + }, + } + + h = &hostUtils{ + execInterface: fakeExec, + } + }) + + It("should parse mstconfig output correctly", func() { + query, err := h.QueryNvConfig(context.TODO(), pciAddress) + Expect(err).ToNot(HaveOccurred()) + + // Verify regular parameters + Expect(query.DefaultConfig).To(HaveKeyWithValue("KEEP_IB_LINK_UP_P2", "0")) + Expect(query.CurrentConfig).To(HaveKeyWithValue("KEEP_IB_LINK_UP_P2", "1")) + Expect(query.NextBootConfig).To(HaveKeyWithValue("KEEP_IB_LINK_UP_P2", "1")) + + Expect(query.DefaultConfig).To(HaveKeyWithValue("MEMIC_SIZE_LIMIT", "1")) + Expect(query.CurrentConfig).To(HaveKeyWithValue("MEMIC_SIZE_LIMIT", "1")) + Expect(query.NextBootConfig).To(HaveKeyWithValue("MEMIC_SIZE_LIMIT", "0")) + + Expect(query.DefaultConfig).To(HaveKeyWithValue("KEEP_LINK_UP_ON_BOOT_P2", "0")) + Expect(query.CurrentConfig).To(HaveKeyWithValue("KEEP_LINK_UP_ON_BOOT_P2", "0")) + Expect(query.NextBootConfig).To(HaveKeyWithValue("KEEP_LINK_UP_ON_BOOT_P2", "0")) + + // Verify array parameters + for i := 0; i <= 7; i++ { + key := fmt.Sprintf("ESWITCH_HAIRPIN_DESCRIPTORS[%d]", i) + Expect(query.DefaultConfig).To(HaveKeyWithValue(key, "128")) + Expect(query.CurrentConfig).To(HaveKeyWithValue(key, "128")) + Expect(query.NextBootConfig).To(HaveKeyWithValue(key, "128")) + } + }) + + It("should handle missing configurations section gracefully", func() { + cmd1 := &execTesting.FakeCmd{} + cmd1.OutputScript = append(cmd1.OutputScript, + func() ([]byte, []byte, error) { + output := ` +Device #1: +---------- + +Device type: ConnectX4 +` + return []byte(output), nil, nil + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("mstconfig")) + Expect(args).To(Equal([]string{"-d", pciAddress, "-e", "query"})) + return cmd1 + }, + } + + h.execInterface = fakeExec + + query, err := h.QueryNvConfig(context.TODO(), pciAddress) + Expect(err).ToNot(HaveOccurred()) + + Expect(query.DefaultConfig).To(BeEmpty()) + Expect(query.CurrentConfig).To(BeEmpty()) + Expect(query.NextBootConfig).To(BeEmpty()) + }) + + It("should return error if mstconfig command fails", func() { + // Set the command to return an error + cmd1 := &execTesting.FakeCmd{} + cmd1.OutputScript = append(cmd1.OutputScript, + func() ([]byte, []byte, error) { + return nil, nil, fmt.Errorf("mstconfig error") + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("mstconfig")) + Expect(args).To(Equal([]string{"-d", pciAddress, "-e", "query"})) + return cmd1 + }, + } + + h.execInterface = fakeExec + + _, err := h.QueryNvConfig(context.TODO(), pciAddress) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(Equal("mstconfig error")) + }) + }) + Describe("GetMaxReadRequestSize", func() { + var ( + h *hostUtils + fakeExec *execTesting.FakeExec + pciAddr string + ) + + BeforeEach(func() { + fakeExec = &execTesting.FakeExec{} + + h = &hostUtils{ + execInterface: fakeExec, + } + }) + + It("should return the correct PCI link speed when lspci output is valid", func() { + lspciOutput := ` +03:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] + Subsystem: Mellanox Technologies Device 0000 + ... + DevCtl: CorrErr- NonFatalErr- FatalErr- UnsupReq- + RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop- FLReset- + MaxPayload 128 bytes, MaxReadReq 256 bytes + DevSta: CorrErr+ NonFatalErr- FatalErr+ UnsupReq- AuxPwr- TransPend- +` + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, + func() ([]byte, []byte, error) { + return []byte(lspciOutput), nil, nil + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("lspci")) + Expect(args).To(Equal([]string{"-vv", "-s", pciAddr})) + return fakeCmd + }, + } + + maxReadRequestSize, err := h.GetMaxReadRequestSize(pciAddr) + Expect(err).ToNot(HaveOccurred()) + Expect(maxReadRequestSize).To(Equal(256)) + }) + + It("should return -1 when the lspci output does not contain MaxReadReq line", func() { + lspciOutput := ` +03:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] + Subsystem: Mellanox Technologies Device 0000 + ... +` + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, + func() ([]byte, []byte, error) { + return []byte(lspciOutput), nil, nil + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("lspci")) + Expect(args).To(Equal([]string{"-vv", "-s", pciAddr})) + return fakeCmd + }, + } + + maxReadRequestSize, err := h.GetMaxReadRequestSize(pciAddr) + Expect(err).ToNot(HaveOccurred()) + Expect(maxReadRequestSize).To(Equal(-1)) + }) + + It("should return an error when lspci command fails", func() { + // Mock the command to fail + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, + func() ([]byte, []byte, error) { + return nil, nil, fmt.Errorf("lspci command failed") + }, + ) + + fakeExec.CommandScript = []execTesting.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("lspci")) + Expect(args).To(Equal([]string{"-vv", "-s", pciAddr})) + return fakeCmd + }, + } + + maxReadRequestSize, err := h.GetMaxReadRequestSize(pciAddr) + Expect(err).To(HaveOccurred()) + Expect(maxReadRequestSize).To(Equal(-1)) + }) + }) + Describe("GetTrustAndPFC", func() { + It("should return parsed values", func() { + interfaceName := "enp3s0f0np0" + trust := "dscp" + pfc := "0,1,0,1,0,1,0,0" + + fakeExec := &execTesting.FakeExec{} + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.CombinedOutputScript = append(fakeCmd.CombinedOutputScript, func() ([]byte, []byte, error) { + return []byte("irrelevant line\n" + + "Priority trust state: dscp\n" + + " enabled 0 1 0 1 0 1 0 0 \n" + + "another irrelevant line"), + nil, nil + }) + + fakeExec.CommandScript = append(fakeExec.CommandScript, func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("mlnx_qos")) + Expect(args).To(Equal([]string{"-i", interfaceName})) + return fakeCmd + }) + + h := &hostUtils{ + execInterface: fakeExec, + } + + observedTrust, observedPFC, err := h.GetTrustAndPFC(interfaceName) + + Expect(err).NotTo(HaveOccurred()) + Expect(observedTrust).To(Equal(strings.ToLower(trust))) + Expect(observedPFC).To(Equal(strings.ToLower(pfc))) + }) + It("should return empty string for both numbers if one is empty", func() { + interfaceName := "enp3s0f0np0" + + fakeExec := &execTesting.FakeExec{} + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.CombinedOutputScript = append(fakeCmd.CombinedOutputScript, func() ([]byte, []byte, error) { + return []byte("irrelevant line\n" + + "Priority trust state: dscp\n" + + "another irrelevant line"), + nil, nil + }) + + fakeExec.CommandScript = append(fakeExec.CommandScript, func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("mlnx_qos")) + Expect(args).To(Equal([]string{"-i", interfaceName})) + return fakeCmd + }) + + h := &hostUtils{ + execInterface: fakeExec, + } + + observedTrust, observedPFC, err := h.GetTrustAndPFC(interfaceName) + + Expect(err).To(HaveOccurred()) + Expect(observedTrust).To(Equal("")) + Expect(observedPFC).To(Equal("")) + }) + }) + Describe("SetMaxReadRequestSize", func() { + var ( + h *hostUtils + fakeExec *execTesting.FakeExec + pciAddr string + ) + + BeforeEach(func() { + fakeExec = &execTesting.FakeExec{} + + h = &hostUtils{ + execInterface: fakeExec, + } + }) + + Context("with a valid maxReadRequestSize", func() { + It("executes the setpci command successfully", func() { + maxSize := 256 + valueToApply := 1 + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, func() ([]byte, []byte, error) { + return nil, nil, nil + }) + + fakeExec.CommandScript = append(fakeExec.CommandScript, func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("setpci")) + Expect(args).To(Equal([]string{"-s", pciAddr, fmt.Sprintf("CAP_EXP+08.w=%d000:FFFF", valueToApply)})) + return fakeCmd + }) + + err := h.SetMaxReadRequestSize(pciAddr, maxSize) + + Expect(err).ToNot(HaveOccurred()) + Expect(fakeExec.CommandCalls).To(Equal(1)) + }) + }) + + Context("with an invalid maxReadRequestSize", func() { + It("returns an error without executing any command", func() { + maxSize := 300 + + err := h.SetMaxReadRequestSize(pciAddr, maxSize) + + Expect(err).To(MatchError(fmt.Sprintf( + "unsupported maxReadRequestSize (%d) for pci device (%s). Acceptable values are powers of 2 from 128 to 4096", + maxSize, pciAddr, + ))) + }) + }) + + Context("when the setpci command fails", func() { + It("returns an error", func() { + maxSize := 4096 + valueToApply := 5 + + fakeCmd := &execTesting.FakeCmd{} + fakeCmd.OutputScript = append(fakeCmd.OutputScript, func() ([]byte, []byte, error) { + return nil, nil, errors.New("some error") + }) + + fakeExec.CommandScript = append(fakeExec.CommandScript, func(cmd string, args ...string) exec.Cmd { + Expect(cmd).To(Equal("setpci")) + Expect(args).To(Equal([]string{"-s", pciAddr, fmt.Sprintf("CAP_EXP+08.w=%d000:FFFF", valueToApply)})) + return fakeCmd + }) + + err := h.SetMaxReadRequestSize(pciAddr, maxSize) + + Expect(err).To(HaveOccurred()) + Expect(fakeExec.CommandCalls).To(Equal(1)) + }) + }) + }) }) diff --git a/pkg/maintenance/maintenancemanager.go b/pkg/maintenance/maintenancemanager.go new file mode 100644 index 0000000..0aeb66e --- /dev/null +++ b/pkg/maintenance/maintenancemanager.go @@ -0,0 +1,158 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package maintenance + +import ( + "context" + + maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1" + "github.com/Mellanox/nic-configuration-operator/pkg/host" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/Mellanox/nic-configuration-operator/pkg/consts" +) + +type MaintenanceManager interface { + ScheduleMaintenance(ctx context.Context) error + MaintenanceAllowed(ctx context.Context) (bool, error) + ReleaseMaintenance(ctx context.Context) error + Reboot() error +} + +type maintenanceManager struct { + client client.Client + hostUtils host.HostUtils + nodeName string + namespace string +} + +func (m maintenanceManager) getNodeMaintenanceObject(ctx context.Context) (*maintenanceoperator.NodeMaintenance, error) { + list := maintenanceoperator.NodeMaintenanceList{} + err := m.client.List(ctx, &list, client.InNamespace(m.namespace)) + if err != nil { + log.Log.Error(err, "failed to get node maintenance objects") + return nil, err + } + + for _, obj := range list.Items { + if obj.Spec.RequestorID == consts.MaintenanceRequestor && obj.Spec.NodeName == m.nodeName { + return &obj, nil + } + } + + return nil, nil +} + +func (m maintenanceManager) ScheduleMaintenance(ctx context.Context) error { + log.Log.Info("maintenanceManager.ScheduleMaintenance()") + + scheduledMaintenance, err := m.getNodeMaintenanceObject(ctx) + if err != nil { + log.Log.Error(err, "failed to schedule node maintenance") + return err + } + + if scheduledMaintenance != nil { + // Maintenance already scheduled by us, nothing to do + return nil + } + + maintenanceRequest := &maintenanceoperator.NodeMaintenance{ + ObjectMeta: metav1.ObjectMeta{ + Name: consts.MaintenanceRequestName, + Namespace: m.namespace, + }, + Spec: maintenanceoperator.NodeMaintenanceSpec{ + RequestorID: consts.MaintenanceRequestor, + AdditionalRequestors: nil, + NodeName: m.nodeName, + Cordon: true, + WaitForPodCompletion: nil, + DrainSpec: &maintenanceoperator.DrainSpec{ + Force: true, + DeleteEmptyDir: true, + }, + }, + } + + err = m.client.Create(ctx, maintenanceRequest) + if err != nil { + log.Log.Error(err, "failed to schedule node maintenance") + return err + } + + return nil +} + +func (m maintenanceManager) MaintenanceAllowed(ctx context.Context) (bool, error) { + log.Log.Info("maintenanceManager.MaintenanceAllowed()") + scheduledMaintenance, err := m.getNodeMaintenanceObject(ctx) + if err != nil { + log.Log.Error(err, "failed to get node maintenance") + return false, err + } + + if scheduledMaintenance == nil { + // We want to perform maintenance on NICs only when node is properly prepared + return false, nil + } + + readyCondition := meta.FindStatusCondition(scheduledMaintenance.Status.Conditions, maintenanceoperator.ConditionTypeReady) + if readyCondition == nil { + log.Log.V(2).Info("couldn't retrieve maintenance condition, retry") + return false, nil + } + + if readyCondition.Status != metav1.ConditionTrue { + log.Log.V(2).Info("maintenance is not ready yet", "reason", readyCondition.Reason, "message", readyCondition.Message) + return false, nil + } + + return true, nil +} + +func (m maintenanceManager) ReleaseMaintenance(ctx context.Context) error { + log.Log.Info("maintenanceManager.ReleaseMaintenance()") + + scheduledMaintenance, err := m.getNodeMaintenanceObject(ctx) + if err != nil { + log.Log.Error(err, "failed to get node maintenance") + return err + } + + if scheduledMaintenance != nil { + err = m.client.Delete(ctx, scheduledMaintenance) + if err != nil { + log.Log.Error(err, "failed to release node maintenance") + return err + } + } + + return nil +} + +func (m maintenanceManager) Reboot() error { + log.Log.Info("maintenanceManager.Reboot()") + + return m.hostUtils.ScheduleReboot() +} + +func New(client client.Client, hostUtils host.HostUtils, nodeName string, namespace string) MaintenanceManager { + return maintenanceManager{client: client, hostUtils: hostUtils, nodeName: nodeName, namespace: namespace} +} diff --git a/pkg/maintenance/mocks/MaintenanceManager.go b/pkg/maintenance/mocks/MaintenanceManager.go new file mode 100644 index 0000000..9d66cc7 --- /dev/null +++ b/pkg/maintenance/mocks/MaintenanceManager.go @@ -0,0 +1,110 @@ +// Code generated by mockery v2.45.0. DO NOT EDIT. + +package mocks + +import ( + context "context" + + mock "github.com/stretchr/testify/mock" +) + +// MaintenanceManager is an autogenerated mock type for the MaintenanceManager type +type MaintenanceManager struct { + mock.Mock +} + +// MaintenanceAllowed provides a mock function with given fields: ctx +func (_m *MaintenanceManager) MaintenanceAllowed(ctx context.Context) (bool, error) { + ret := _m.Called(ctx) + + if len(ret) == 0 { + panic("no return value specified for MaintenanceAllowed") + } + + var r0 bool + var r1 error + if rf, ok := ret.Get(0).(func(context.Context) (bool, error)); ok { + return rf(ctx) + } + if rf, ok := ret.Get(0).(func(context.Context) bool); ok { + r0 = rf(ctx) + } else { + r0 = ret.Get(0).(bool) + } + + if rf, ok := ret.Get(1).(func(context.Context) error); ok { + r1 = rf(ctx) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Reboot provides a mock function with given fields: +func (_m *MaintenanceManager) Reboot() error { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for Reboot") + } + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ReleaseMaintenance provides a mock function with given fields: ctx +func (_m *MaintenanceManager) ReleaseMaintenance(ctx context.Context) error { + ret := _m.Called(ctx) + + if len(ret) == 0 { + panic("no return value specified for ReleaseMaintenance") + } + + var r0 error + if rf, ok := ret.Get(0).(func(context.Context) error); ok { + r0 = rf(ctx) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ScheduleMaintenance provides a mock function with given fields: ctx +func (_m *MaintenanceManager) ScheduleMaintenance(ctx context.Context) error { + ret := _m.Called(ctx) + + if len(ret) == 0 { + panic("no return value specified for ScheduleMaintenance") + } + + var r0 error + if rf, ok := ret.Get(0).(func(context.Context) error); ok { + r0 = rf(ctx) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// NewMaintenanceManager creates a new instance of MaintenanceManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewMaintenanceManager(t interface { + mock.TestingT + Cleanup(func()) +}) *MaintenanceManager { + mock := &MaintenanceManager{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/pkg/testutils/testutils.go b/pkg/testutils/testutils.go new file mode 100644 index 0000000..9ec32dd --- /dev/null +++ b/pkg/testutils/testutils.go @@ -0,0 +1,71 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testutils + +import ( + "github.com/onsi/gomega/format" + "github.com/onsi/gomega/types" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func MatchCondition(condition metav1.Condition) types.GomegaMatcher { + return &conditionMatcher{condition: condition} +} + +type conditionMatcher struct { + condition metav1.Condition +} + +func (matcher *conditionMatcher) Match(actual interface{}) (success bool, err error) { + if actual == nil { + return false, nil + } + + actualConditions := actual.([]metav1.Condition) + + if len(actualConditions) == 0 { + return false, nil + } + + desiredCondition := meta.FindStatusCondition(actualConditions, matcher.condition.Type) + + if desiredCondition == nil { + return false, nil + } + + if desiredCondition.Status != matcher.condition.Status { + return false, nil + } + + if desiredCondition.Reason != matcher.condition.Reason { + return false, nil + } + + if desiredCondition.Message != matcher.condition.Message { + return false, nil + } + + return true, nil +} + +func (matcher *conditionMatcher) FailureMessage(actual interface{}) (message string) { + return format.Message(actual, "to match condition", matcher.condition) +} + +func (matcher *conditionMatcher) NegatedFailureMessage(actual interface{}) (message string) { + return format.Message(actual, "not to match condition", matcher.condition) +} diff --git a/pkg/types/types.go b/pkg/types/types.go new file mode 100644 index 0000000..eb22e2f --- /dev/null +++ b/pkg/types/types.go @@ -0,0 +1,45 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import ( + "fmt" + "strings" +) + +type NvConfigQuery struct { + DefaultConfig map[string]string + CurrentConfig map[string]string + NextBootConfig map[string]string +} + +func NewNvConfigQuery() NvConfigQuery { + return NvConfigQuery{ + DefaultConfig: make(map[string]string), + CurrentConfig: make(map[string]string), + NextBootConfig: make(map[string]string), + } +} + +const IncorrectSpecErrorPrefix = "incorrect spec" + +func IncorrectSpecError(msg string) error { + return fmt.Errorf("%s: %s", IncorrectSpecErrorPrefix, msg) +} + +func IsIncorrectSpecError(err error) bool { + return strings.HasPrefix(err.Error(), IncorrectSpecErrorPrefix) +}