Skip to content

Commit

Permalink
feat: Provisioning for fw binaries
Browse files Browse the repository at this point in the history
Introduced two new CRDs: NicFirmwareSource and NicFirmwareTemplate
New controller reconciling NicFirmwareSources
firmware package with logic for provisioning firmware files on shared storage
unit tests

Signed-off-by: Alexander Maslennikov <amaslennikov@nvidia.com>
  • Loading branch information
almaslennikov committed Jan 20, 2025
1 parent 26b9aa9 commit aa7dc94
Show file tree
Hide file tree
Showing 16 changed files with 2,171 additions and 3 deletions.
7 changes: 4 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ COPY ./ ./
#RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/maintenance-manager/main.go
RUN --mount=type=cache,target=/go/pkg/mod/ GO_GCFLAGS=${GCFLAGS} make build-manager

# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
FROM gcr.io/distroless/static:nonroot
FROM quay.io/centos/centos:stream9

RUN yum -y install mstflint && yum clean all

WORKDIR /
COPY --from=builder /workspace/build/manager .
COPY bindata /bindata
Expand Down
10 changes: 10 additions & 0 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (

configurationnetv1alpha1 "github.com/Mellanox/nic-configuration-operator/api/v1alpha1"
"github.com/Mellanox/nic-configuration-operator/internal/controller"
"github.com/Mellanox/nic-configuration-operator/pkg/firmware"
"github.com/Mellanox/nic-configuration-operator/pkg/ncolog"
"github.com/Mellanox/nic-configuration-operator/pkg/version"
//+kubebuilder:scaffold:imports
Expand Down Expand Up @@ -145,6 +146,15 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "NicConfigurationTemplate")
os.Exit(1)
}

if err = (&controller.NicFirmwareSourceReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
FirmwareProvisioner: firmware.NewFirmwareProvisioner(),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NicFirmwareSource")
os.Exit(1)
}
//+kubebuilder:scaffold:builder

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand Down
52 changes: 52 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,58 @@ rules:
- get
- patch
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaresources
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaresources/finalizers
verbs:
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaresources/status
verbs:
- get
- patch
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaretemplates
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaretemplates/finalizers
verbs:
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaretemplates/status
verbs:
- get
- patch
- update
- apiGroups:
- coordination.k8s.io
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,12 @@ spec:
periodSeconds: 10
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
- name: firmware-cache
mountPath: /nic-firmware
readOnly: false
# TODO change to user-specified storage class
volumes:
- name: firmware-cache
hostPath:
path: /tmp
60 changes: 60 additions & 0 deletions deployment/nic-configuration-operator-chart/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,63 @@ rules:
- patch
- update
- watch
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaresources
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaresources/finalizers
verbs:
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaresources/status
verbs:
- get
- patch
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaretemplates
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaretemplates/finalizers
verbs:
- update
- apiGroups:
- configuration.net.nvidia.com
resources:
- nicfirmwaretemplates/status
verbs:
- get
- patch
- update
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- get
- update
6 changes: 6 additions & 0 deletions internal/controller/nicconfigurationtemplate_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ type NicConfigurationTemplateReconciler struct {
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicdevices/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicdevices,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicdevices/finalizers,verbs=update
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaretemplates,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaretemplates/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaretemplates/finalizers,verbs=update
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaresources/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaresources,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaresources/finalizers,verbs=update
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
//+kubebuilder:rbac:groups="",resources=events,verbs=create
Expand Down
144 changes: 144 additions & 0 deletions internal/controller/nicfirmwaresource_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/*
2025 NVIDIA CORPORATION & AFFILIATES
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
"context"

"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/Mellanox/nic-configuration-operator/api/v1alpha1"
"github.com/Mellanox/nic-configuration-operator/pkg/consts"
"github.com/Mellanox/nic-configuration-operator/pkg/firmware"
)

// NicFirmwareSourceReconciler reconciles a NicFirmwareSource object.
//
// It embeds a Kubernetes client for reading NicFirmwareSource objects and
// writing their status, and delegates all firmware cache work to
// FirmwareProvisioner.
type NicFirmwareSourceReconciler struct {
// Client is used to get NicFirmwareSource objects and update their status.
client.Client
// Scheme is the runtime scheme supplied by the manager when the reconciler is constructed.
Scheme *runtime.Scheme

// FirmwareProvisioner verifies, downloads, processes and validates the
// firmware binaries cache on behalf of Reconcile and ValidateCache.
FirmwareProvisioner firmware.FirmwareProvisioner
}

// Reconcile brings the firmware cache of a NicFirmwareSource in line with its spec.
//
// The steps are, in order:
//  1. fetch the NicFirmwareSource named by req; if it no longer exists, return
//     without an error so the request is not requeued;
//  2. ask the provisioner which of the URLs in Spec.BinUrlSource still have to
//     be processed for the cache named after the object;
//  3. download and unzip those archives, if there are any;
//  4. add the firmware binaries to the cache by their metadata;
//  5. validate the cache and publish the result via ValidateCache.
//
// The object's status state is updated before the download and processing
// steps, and set to the matching failure state (with the error text as the
// reason) when a step fails.
//
// When a step fails, the step's own error is returned so that the request is
// retried. The controller only watches generation changes, so a swallowed error
// would leave the object in a failed state until its spec is edited. If writing
// the failure status itself fails, that error is returned instead.
func (r *NicFirmwareSourceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// Fetch the NicFirmwareSource instance
	instance := &v1alpha1.NicFirmwareSource{}
	err := r.Get(ctx, req.NamespacedName, instance)

	// TODO use finalizers to clean up cache storage after CR deletion

	if err != nil {
		if errors.IsNotFound(err) {
			// Request object not found, could have been deleted after reconcile request.
			// Return and don't requeue
			return reconcile.Result{}, nil
		}
		return reconcile.Result{}, err
	}

	// The cache is keyed by the name of the NicFirmwareSource object.
	cacheName := instance.Name

	urlsToProcess, err := r.FirmwareProvisioner.VerifyCachedBinaries(cacheName, instance.Spec.BinUrlSource)
	if err != nil {
		return r.failWithStatus(ctx, instance, consts.FirmwareSourceCacheVerificationFailedStatus, err)
	}

	if len(urlsToProcess) != 0 {
		if err = r.updateStatus(ctx, instance, consts.FirmwareSourceDownloadingStatus, nil, nil); err != nil {
			return reconcile.Result{}, err
		}

		if err = r.FirmwareProvisioner.DownloadAndUnzipFirmwareArchives(cacheName, urlsToProcess, true); err != nil {
			return r.failWithStatus(ctx, instance, consts.FirmwareSourceDownloadFailedStatus, err)
		}
	} else {
		log.Log.Info("Files for all requested URLs already present, skipping download", "cacheName", cacheName)
	}

	if err = r.updateStatus(ctx, instance, consts.FirmwareSourceProcessingStatus, nil, nil); err != nil {
		return reconcile.Result{}, err
	}

	if err = r.FirmwareProvisioner.AddFirmwareBinariesToCacheByMetadata(cacheName); err != nil {
		return r.failWithStatus(ctx, instance, consts.FirmwareSourceProcessingFailedStatus, err)
	}

	return r.ValidateCache(ctx, instance)
}

// failWithStatus records cause on the object's status under the given failure
// state and returns the error Reconcile should propagate: the status update
// error if the update itself failed, otherwise cause.
//
// A separate variable is used for the status update result so that a
// successful update cannot overwrite cause with nil.
func (r *NicFirmwareSourceReconciler) failWithStatus(ctx context.Context, instance *v1alpha1.NicFirmwareSource, status string, cause error) (reconcile.Result, error) {
	if updateErr := r.updateStatus(ctx, instance, status, cause, nil); updateErr != nil {
		return reconcile.Result{}, updateErr
	}
	return reconcile.Result{}, cause
}

// ValidateCache validates the firmware cache named after instance and records
// the outcome on the object's status.
//
// On success the status is set to the success state together with the versions
// reported by the provisioner. On failure the status is set to the
// processing-failed state with the error text as the reason, and the validation
// error is returned so the request is retried. If the status update itself
// fails, that error is returned instead.
func (r *NicFirmwareSourceReconciler) ValidateCache(ctx context.Context, instance *v1alpha1.NicFirmwareSource) (reconcile.Result, error) {
	versions, err := r.FirmwareProvisioner.ValidateCache(instance.Name)
	if err != nil {
		// Keep the status update result in its own variable: reusing err here
		// would replace the validation error with nil on a successful update
		// and report the failed validation as a successful reconcile.
		if updateErr := r.updateStatus(ctx, instance, consts.FirmwareSourceProcessingFailedStatus, err, nil); updateErr != nil {
			return reconcile.Result{}, updateErr
		}
		return reconcile.Result{}, err
	}

	if err = r.updateStatus(ctx, instance, consts.FirmwareSourceSuccessStatus, nil, versions); err != nil {
		return reconcile.Result{}, err
	}

	return ctrl.Result{}, nil
}

// updateStatus refreshes obj from the API server and then writes a new status
// for it: the given state, the text of statusError as the reason (empty when
// statusError is nil) and the given versions.
//
// The status is changed several times during one reconciliation, so obj is
// re-read first to make sure the update is applied to the latest version.
// Any error from the read or from the status update is returned as is.
func (r *NicFirmwareSourceReconciler) updateStatus(ctx context.Context, obj *v1alpha1.NicFirmwareSource, status string, statusError error, versions map[string][]string) error {
	key := types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}
	if getErr := r.Get(ctx, key, obj); getErr != nil {
		return getErr
	}

	reason := ""
	if statusError != nil {
		reason = statusError.Error()
	}

	obj.Status.State = status
	obj.Status.Reason = reason
	obj.Status.Versions = versions

	return r.Status().Update(ctx, obj)
}

// SetupWithManager registers the reconciler with mgr under the name
// "nicFirmwareSourceReconciler". It watches NicFirmwareSource objects and
// filters events with GenerationChangedPredicate.
func (r *NicFirmwareSourceReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&v1alpha1.NicFirmwareSource{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
		Named("nicFirmwareSourceReconciler").
		Complete(r)
}
Loading

0 comments on commit aa7dc94

Please sign in to comment.