Skip to content

Commit

Permalink
Merge pull request #13 from almaslennikov/nic-device-controller
Browse files Browse the repository at this point in the history
Add Nic Device Controller
  • Loading branch information
almaslennikov authored Sep 21, 2024
2 parents e1a4b36 + 0744b8d commit 71b57c4
Show file tree
Hide file tree
Showing 27 changed files with 5,117 additions and 133 deletions.
4 changes: 4 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ issues:
linters:
- dupl
- lll
- path: "pkg/*"
linters:
- dupl
- lll
linters:
disable-all: true
enable:
Expand Down
26 changes: 25 additions & 1 deletion Dockerfile.nic-configuration-daemon
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,31 @@ COPY ./ ./
RUN --mount=type=cache,target=/go/pkg/mod/ GO_GCFLAGS=${GCFLAGS} make build-daemon

FROM quay.io/centos/centos:stream9
RUN yum -y install hwdata mstflint && yum clean all
ARG TARGETARCH
ENV RHEL_VERSION=9.4
ENV OFED_PACKAGE_MAJOR_VERSION=24.07
ENV OFED_PACKAGE_MINOR_VERSION=0.6.1.0
ENV MFT_VERSION=4.29.0-131
ENV MLNX_TOOLS_VERSION=0.2407061

RUN yum -y install hwdata mstflint wget pciutils procps-ng kmod systemd && yum clean all

RUN ARCH_SUFFIX="${TARGETARCH}" \
&& ARCH_SUFFIX="${ARCH_SUFFIX//amd64/x86_64}" \
&& ARCH_SUFFIX="${ARCH_SUFFIX//arm64/aarch64}" \
&& wget https://linux.mellanox.com/public/repo/mlnx_ofed/${OFED_PACKAGE_MAJOR_VERSION}-${OFED_PACKAGE_MINOR_VERSION}/rhel${RHEL_VERSION}/${ARCH_SUFFIX}/mlnx-tools-${OFED_PACKAGE_MAJOR_VERSION}-${MLNX_TOOLS_VERSION}.${ARCH_SUFFIX}.rpm \
&& rpm -i mlnx-tools-${OFED_PACKAGE_MAJOR_VERSION}-${MLNX_TOOLS_VERSION}.${ARCH_SUFFIX}.rpm \
&& rm mlnx-tools-${OFED_PACKAGE_MAJOR_VERSION}-${MLNX_TOOLS_VERSION}.${ARCH_SUFFIX}.rpm

RUN ARCH_SUFFIX1="${TARGETARCH}" \
&& ARCH_SUFFIX1="${ARCH_SUFFIX1//amd64/x86_64}" \
&& ARCH_SUFFIX1="${ARCH_SUFFIX1//arm64/aarch64}" \
&& ARCH_SUFFIX2="${TARGETARCH}" \
&& ARCH_SUFFIX2="${ARCH_SUFFIX2//amd64/x86_64}" \
&& wget https://linux.mellanox.com/public/repo/mlnx_ofed/${OFED_PACKAGE_MAJOR_VERSION}-${OFED_PACKAGE_MINOR_VERSION}/rhel${RHEL_VERSION}/${ARCH_SUFFIX1}/mft-${MFT_VERSION}.${ARCH_SUFFIX2}.rpm \
&& rpm -i mft-${MFT_VERSION}.${ARCH_SUFFIX2}.rpm \
&& rm mft-${MFT_VERSION}.${ARCH_SUFFIX2}.rpm

WORKDIR /
COPY --from=builder /workspace/build/nic-configuration-daemon .

Expand Down
2 changes: 1 addition & 1 deletion api/v1alpha1/nicconfigurationtemplate_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ type NicSelectorSpec struct {
// +enum
type LinkTypeEnum string

// PciPerformanceOptimizedSpec specifies PCI performace optimization settings
// PciPerformanceOptimizedSpec specifies PCI performance optimization settings
type PciPerformanceOptimizedSpec struct {
// Specifies whether to enable PCI performance optimization
Enabled bool `json:"enabled"`
Expand Down
36 changes: 23 additions & 13 deletions cmd/nic-configuration-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@ import (
"flag"
"os"

"github.com/Mellanox/nic-configuration-operator/internal/controller"
maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/Mellanox/nic-configuration-operator/api/v1alpha1"
"github.com/Mellanox/nic-configuration-operator/internal/controller"
"github.com/Mellanox/nic-configuration-operator/pkg/host"
"github.com/Mellanox/nic-configuration-operator/pkg/maintenance"
"github.com/Mellanox/nic-configuration-operator/pkg/ncolog"
)

Expand All @@ -25,17 +28,9 @@ func main() {
flag.Parse()
ncolog.InitLog()

err := clientgoscheme.AddToScheme(scheme)
if err != nil {
log.Log.Error(err, "failed to load client-go to scheme")
os.Exit(1)
}

err = v1alpha1.AddToScheme(scheme)
if err != nil {
log.Log.Error(err, "failed to load nic configuration CRDs to scheme")
os.Exit(1)
}
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(maintenanceoperator.AddToScheme(scheme))
utilruntime.Must(v1alpha1.AddToScheme(scheme))

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
Expand All @@ -59,14 +54,29 @@ func main() {
}

hostUtils := host.NewHostUtils()
hostManager := host.NewHostManager(hostUtils)
hostManager := host.NewHostManager(nodeName, hostUtils)
maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace)

deviceDiscovery := controller.NewDeviceRegistry(mgr.GetClient(), hostManager, nodeName, namespace)
if err = mgr.Add(deviceDiscovery); err != nil {
log.Log.Error(err, "unable to add device discovery runnable")
os.Exit(1)
}

nicDeviceReconciler := controller.NicDeviceReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
NodeName: nodeName,
NamespaceName: namespace,
HostManager: hostManager,
MaintenanceManager: maintenanceManager,
}
err = nicDeviceReconciler.SetupWithManager(mgr, true)
if err != nil {
log.Log.Error(err, "unable to create controller", "controller", "NicDeviceReconciler")
os.Exit(1)
}

ctx := ctrl.SetupSignalHandler()

err = mgr.GetCache().IndexField(ctx, &v1alpha1.NicDevice{}, "status.node", func(o client.Object) []string {
Expand Down
12 changes: 12 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,15 @@ rules:
- get
- patch
- update
- apiGroups:
- maintenance.nvidia.com
resources:
- nodemaintenances
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
Binary file added docs/nic-configuration-reconcile-diagram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.22.0
toolchain go1.22.4

require (
github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4
github.com/Mellanox/rdmamap v1.1.0
github.com/jaypipes/ghw v0.12.0
github.com/jaypipes/pcidb v1.0.1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4 h1:XTyFEogTo9v/lZXMqKroHSpVimDxYOHvTdwScJHA7v0=
github.com/Mellanox/maintenance-operator/api v0.0.0-20240916123230-810ab7bb25f4/go.mod h1:5OIBO4beWexC3JvLIH1GGNzr49QW7UoZe2LgT/IXYIc=
github.com/Mellanox/rdmamap v1.1.0 h1:A/W1wAXw+6vm58f3VklrIylgV+eDJlPVIMaIKuxgUT4=
github.com/Mellanox/rdmamap v1.1.0/go.mod h1:fN+/V9lf10ABnDCwTaXRjeeWijLt2iVLETnK+sx/LY8=
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
Expand Down
4 changes: 4 additions & 0 deletions internal/controller/devicediscovery_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ func (d *DeviceDiscovery) reconcile(ctx context.Context) error {
return err
}

log.Log.V(2).Info("listed devices", "devices", list.Items)

node := &v1.Node{}
err = d.Client.Get(ctx, types.NamespacedName{Name: d.nodeName}, node)
if err != nil {
Expand All @@ -93,6 +95,7 @@ func (d *DeviceDiscovery) reconcile(ctx context.Context) error {
observedDeviceStatus, exists := observedDevices[nicDeviceCR.Status.SerialNumber]

if !exists {
log.Log.V(2).Info("device doesn't exist on the node anymore, deleting", "device", nicDeviceCR.Name)
// Need to delete this CR, it doesn't represent the observedDevice on host anymore
err = d.Client.Delete(ctx, &nicDeviceCR)
if err != nil {
Expand All @@ -106,6 +109,7 @@ func (d *DeviceDiscovery) reconcile(ctx context.Context) error {
observedDeviceStatus.Conditions = nicDeviceCR.Status.Conditions

if !reflect.DeepEqual(nicDeviceCR.Status, observedDeviceStatus) {
log.Log.V(2).Info("device status changed, updating", "device", nicDeviceCR.Name, "crStatus", nicDeviceCR.Status, "observedStatus", observedDeviceStatus)
// Status of the device has changed, need to update the CR
nicDeviceCR.Status = observedDeviceStatus

Expand Down
9 changes: 6 additions & 3 deletions internal/controller/nicconfigurationtemplate_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ type NicConfigurationTemplateReconciler struct {
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
//+kubebuilder:rbac:groups="",resources=pods,verbs=list
//+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;delete;get;list;patch;update;watch
//+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete

// Reconcile reconciles the NicConfigurationTemplate object
func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
Expand Down Expand Up @@ -89,11 +90,13 @@ func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req

nodeMap := map[string]*v1.Node{}
for _, node := range nodeList.Items {
node := node
nodeMap[node.Name] = &node
}

templates := []*v1alpha1.NicConfigurationTemplate{}
for _, template := range templateList.Items {
template := template
templates = append(templates, &template)
}

Expand All @@ -108,7 +111,7 @@ func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req

for _, template := range templates {
if !deviceMatchesSelectors(&device, template, node) {
r.dropDeviceFromStatus(ctx, device.Name, template)
r.dropDeviceFromStatus(device.Name, template)

continue
}
Expand All @@ -129,7 +132,7 @@ func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req

if len(matchingTemplates) > 1 {
for _, template := range matchingTemplates {
r.dropDeviceFromStatus(ctx, device.Name, template)
r.dropDeviceFromStatus(device.Name, template)
}

templateNames := []string{}
Expand Down Expand Up @@ -171,7 +174,7 @@ func (r *NicConfigurationTemplateReconciler) Reconcile(ctx context.Context, req
return ctrl.Result{}, nil
}

func (r *NicConfigurationTemplateReconciler) dropDeviceFromStatus(ctx context.Context, deviceName string, template *v1alpha1.NicConfigurationTemplate) {
func (r *NicConfigurationTemplateReconciler) dropDeviceFromStatus(deviceName string, template *v1alpha1.NicConfigurationTemplate) {
index := slices.Index(template.Status.NicDevices, deviceName)
if index != -1 {
// Device no longer matches template, drop it from the template's status
Expand Down
Loading

0 comments on commit 71b57c4

Please sign in to comment.