Skip to content

Commit

Permalink
Expose device UUIDs to node label
Browse files Browse the repository at this point in the history
Signed-off-by: Zubiao Xiong <zubiao.xiong@memverge.com>
  • Loading branch information
xiongzubiao committed Jan 21, 2025
1 parent f7dc5f1 commit 89cd885
Show file tree
Hide file tree
Showing 11 changed files with 86 additions and 0 deletions.
19 changes: 19 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating IMEX labeler: %v", err)
}

uuidLabler, err := newGPUUUIDLabeler(devices)
if err != nil {
return nil, fmt.Errorf("error creating UUID labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
Expand All @@ -93,6 +98,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
resourceLabeler,
gpuModeLabeler,
imexLabeler,
uuidLabler,
)

return l, nil
Expand Down Expand Up @@ -261,3 +267,16 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
}
return classes, nil
}

// newGPUUUIDLabeler creates a new labeler that reports the UUIDs of GPUs on the node.
//
// One label of the form "nvidia.com/gpu-<idx>.uuid" is generated per entry in
// devices, where <idx> is the position of the device in the supplied slice and
// the label value is the string returned by the device's GetUUID method.
//
// If the UUID of any device cannot be queried, no labels are returned and the
// resulting error wraps the underlying failure together with the index of the
// offending device.
func newGPUUUIDLabeler(devices []resource.Device) (Labeler, error) {
	labels := make(Labels, len(devices))
	for idx, d := range devices {
		uuid, err := d.GetUUID()
		if err != nil {
			// Identify the failing device; the bare error alone does not say
			// which of the devices could not be queried.
			return nil, fmt.Errorf("error getting UUID for device %d: %w", idx, err)
		}
		labels[fmt.Sprintf("nvidia.com/gpu-%d.uuid", idx)] = uuid
	}
	return labels, nil
}
37 changes: 37 additions & 0 deletions internal/resource/device_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions internal/resource/nvml-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ func (d nvmlDevice) GetName() (string, error) {
return name, nil
}

// GetUUID returns the UUID reported by the wrapped NVML device handle.
// When the NVML call does not return nvml.SUCCESS, the empty string is
// returned and the NVML return value is passed back as the error.
func (d nvmlDevice) GetUUID() (string, error) {
	id, status := d.Device.GetUUID()
	if status == nvml.SUCCESS {
		return id, nil
	}
	return "", status
}

// GetTotalMemoryMB returns the total memory on a device in MB
func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) {
info, ret := d.Device.GetMemoryInfo()
Expand Down
9 changes: 9 additions & 0 deletions internal/resource/nvml-mig-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@ func (d nvmlMigDevice) GetName() (string, error) {
return resourceName, nil
}

// GetUUID returns the UUID reported by the wrapped MIG device handle.
// When the NVML call does not return nvml.SUCCESS, the empty string is
// returned and the NVML return value is passed back as the error.
func (d nvmlMigDevice) GetUUID() (string, error) {
	id, status := d.MigDevice.GetUUID()
	if status == nvml.SUCCESS {
		return id, nil
	}
	return "", status
}

// GetTotalMemoryMB returns the total memory on a device in MB
func (d nvmlMigDevice) GetTotalMemoryMB() (uint64, error) {
attr, err := d.GetAttributes()
Expand Down
6 changes: 6 additions & 0 deletions internal/resource/sysfs-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ func (d vfioDevice) GetName() (string, error) {
return d.nvidiaPCIDevice.DeviceName, nil
}

// GetUUID is not implemented for vfio devices: it always returns an empty
// UUID together with a non-nil error stating that the call is unsupported.
func (d vfioDevice) GetUUID() (string, error) {
	unsupported := fmt.Errorf("GetUUID is not supported for vfio devices")
	return "", unsupported
}

// GetTotalMemoryMB returns the total memory on a device in MB
func (d vfioDevice) GetTotalMemoryMB() (uint64, error) {
_, val := d.nvidiaPCIDevice.Resources.GetTotalAddressableMemory(true)
Expand All @@ -72,6 +77,7 @@ func (d vfioDevice) GetPCIClass() (uint32, error) {
// IsFabricAttached always reports false with a nil error for vfio devices.
func (d vfioDevice) IsFabricAttached() (bool, error) {
	var attached bool // zero value: never fabric attached
	return attached, nil
}

// GetFabricIDs is not implemented for vfio devices: it always returns two
// empty strings together with a non-nil error stating that the call is
// unsupported.
func (d vfioDevice) GetFabricIDs() (string, string, error) {
	unsupported := fmt.Errorf("GetFabricIDs is not supported for vfio devices")
	return "", "", unsupported
}
1 change: 1 addition & 0 deletions internal/resource/testing/resource-testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ func NewMigEnabledDevice(migs ...*resource.DeviceMock) resource.Device {
func NewDeviceMock(migEnabled bool) *DeviceMock {
d := DeviceMock{resource.DeviceMock{
GetNameFunc: func() (string, error) { return "MOCKMODEL", nil },
GetUUIDFunc: func() (string, error) { return "MOCKUUID", nil },
GetCudaComputeCapabilityFunc: func() (int, int, error) {
if migEnabled {
return 0, 0, nil
Expand Down
1 change: 1 addition & 0 deletions internal/resource/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type Device interface {
GetMigDevices() ([]Device, error)
GetAttributes() (map[string]interface{}, error)
GetName() (string, error)
GetUUID() (string, error)
GetTotalMemoryMB() (uint64, error)
GetDeviceHandleFromMigDeviceHandle() (Device, error)
GetCudaComputeCapability() (int, int, error)
Expand Down
1 change: 1 addition & 0 deletions tests/expected-output-mig-mixed.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,4 @@ nvidia\.com\/mig-[0-9]+g\.[0-9]+gb\.engines\.ofa=[0-9]+
nvidia\.com\/mig-[0-9]+g\.[0-9]+gb\.slices\.gi=[0-9]+
nvidia\.com\/mig-[0-9]+g\.[0-9]+gb\.slices\.ci=[0-9]+
nvidia\.com\/mps\.capable=[true|false]
nvidia\.com\/gpu-[0-9]+\.uuid=[0-9a-zA-Z-]+
1 change: 1 addition & 0 deletions tests/expected-output-mig-none.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ nvidia\.com\/mig\.capable=[true|false]
nvidia\.com\/gpu\.compute\.major=[0-9]+
nvidia\.com\/gpu\.compute\.minor=[0-9]+
nvidia\.com\/mps\.capable=[true|false]
nvidia\.com\/gpu-[0-9]+\.uuid=[0-9a-zA-Z-]+
1 change: 1 addition & 0 deletions tests/expected-output-mig-single.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ nvidia\.com\/gpu\.slices\.gi=[0-9]+
nvidia\.com\/gpu\.slices\.ci=[0-9]+
nvidia\.com\/gpu\.mode=[compute]
nvidia\.com\/mps\.capable=[true|false]
nvidia\.com\/gpu-[0-9]+\.uuid=[0-9a-zA-Z-]+
1 change: 1 addition & 0 deletions tests/expected-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ nvidia\.com\/mig\.capable=[true|false]
nvidia\.com\/gpu\.compute\.major=[0-9]+
nvidia\.com\/gpu\.compute\.minor=[0-9]+
nvidia\.com\/mps\.capable=[true|false]
nvidia\.com\/gpu-[0-9]+\.uuid=[0-9a-zA-Z-]+

0 comments on commit 89cd885

Please sign in to comment.