diff --git a/config/crds/troubleshoot.sh_analyzers.yaml b/config/crds/troubleshoot.sh_analyzers.yaml index 4a3b81f76..354a249be 100644 --- a/config/crds/troubleshoot.sh_analyzers.yaml +++ b/config/crds/troubleshoot.sh_analyzers.yaml @@ -1176,6 +1176,10 @@ spec: type: string ephemeralStorageCapacity: type: string + gpuAllocatable: + type: string + gpuCapacity: + type: string memoryAllocatable: type: string memoryCapacity: diff --git a/config/crds/troubleshoot.sh_preflights.yaml b/config/crds/troubleshoot.sh_preflights.yaml index 7b9f23bf9..1e2712491 100644 --- a/config/crds/troubleshoot.sh_preflights.yaml +++ b/config/crds/troubleshoot.sh_preflights.yaml @@ -1176,6 +1176,10 @@ spec: type: string ephemeralStorageCapacity: type: string + gpuAllocatable: + type: string + gpuCapacity: + type: string memoryAllocatable: type: string memoryCapacity: diff --git a/config/crds/troubleshoot.sh_supportbundles.yaml b/config/crds/troubleshoot.sh_supportbundles.yaml index bca8003a0..1541f4333 100644 --- a/config/crds/troubleshoot.sh_supportbundles.yaml +++ b/config/crds/troubleshoot.sh_supportbundles.yaml @@ -1207,6 +1207,10 @@ spec: type: string ephemeralStorageCapacity: type: string + gpuAllocatable: + type: string + gpuCapacity: + type: string memoryAllocatable: type: string memoryCapacity: diff --git a/pkg/analyze/node_resources.go b/pkg/analyze/node_resources.go index 1f6e62c6f..f4eb4917e 100644 --- a/pkg/analyze/node_resources.go +++ b/pkg/analyze/node_resources.go @@ -18,6 +18,8 @@ import ( "github.com/replicatedhq/troubleshoot/pkg/constants" ) +const gpuResourceName = "nvidia.com/gpu" + type AnalyzeNodeResources struct { analyzer *troubleshootv1beta2.NodeResources } @@ -329,6 +331,10 @@ func getQuantity(node corev1.Node, property string) *resource.Quantity { return node.Status.Capacity.StorageEphemeral() case "ephemeralStorageAllocatable": return node.Status.Allocatable.StorageEphemeral() + case "gpuCapacity": + return node.Status.Capacity.Name(gpuResourceName, resource.DecimalSI) + case "gpuAllocatable": + return node.Status.Allocatable.Name(gpuResourceName, resource.DecimalSI) } return nil } @@ -492,5 +498,26 @@ func nodeMatchesFilters(node corev1.Node, filters *troubleshootv1beta2.NodeResou } } + if filters.GPUCapacity != "" { + parsed, err := resource.ParseQuantity(filters.GPUCapacity) + if err != nil { + return false, errors.Wrap(err, "failed to parse gpu capacity") + } + + if node.Status.Capacity.Name(gpuResourceName, resource.DecimalSI).Cmp(parsed) == -1 { + return false, nil + } + } + if filters.GPUAllocatable != "" { + parsed, err := resource.ParseQuantity(filters.GPUAllocatable) + if err != nil { + return false, errors.Wrap(err, "failed to parse gpu allocatable") + } + + if node.Status.Allocatable.Name(gpuResourceName, resource.DecimalSI).Cmp(parsed) == -1 { + return false, nil + } + } + return true, nil } diff --git a/pkg/analyze/node_resources_test.go b/pkg/analyze/node_resources_test.go index 204840843..dbb45a0d7 100644 --- a/pkg/analyze/node_resources_test.go +++ b/pkg/analyze/node_resources_test.go @@ -14,7 +14,7 @@ import ( func Test_compareNodeResourceConditionalToActual(t *testing.T) { nodeData := []corev1.Node{ - corev1.Node{ + { TypeMeta: metav1.TypeMeta{ APIVersion: "v1", Kind: "Node", @@ -28,16 +28,18 @@ func Test_compareNodeResourceConditionalToActual(t *testing.T) { "ephemeral-storage": resource.MustParse("20959212Ki"), "memory": resource.MustParse("3999Ki"), "pods": resource.MustParse("15"), + gpuResourceName: resource.MustParse("4"), }, Allocatable: corev1.ResourceList{ "cpu": resource.MustParse("1.5"), "ephemeral-storage": resource.MustParse("19316009748"), "memory": resource.MustParse("16Ki"), "pods": resource.MustParse("14"), + gpuResourceName: resource.MustParse("4"), }, }, }, - corev1.Node{ + { TypeMeta: metav1.TypeMeta{ APIVersion: "v1", Kind: "Node", @@ -366,6 +368,70 @@ func Test_compareNodeResourceConditionalToActual(t *testing.T) { expected: false, isError: true, }, + { + name: "sum(gpuCapacity) > 1 (true)", + conditional: "sum(gpuCapacity) > 1", + matchingNodes: nodeData, + totalNodeCount: len(nodeData), + expected: true, + isError: false, + }, + { + name: "sum(gpuCapacity) >= 8 (false)", + conditional: "sum(gpuCapacity) >= 8", + matchingNodes: nodeData, + totalNodeCount: len(nodeData), + expected: false, + isError: false, + }, + { + name: "min(gpuCapacity) > 1 (false)", + conditional: "min(gpuCapacity) > 1", + matchingNodes: nodeData, + totalNodeCount: len(nodeData), + expected: false, + isError: false, + }, + { + name: "min(gpuAllocatable) > 1 (false)", + conditional: "min(gpuAllocatable) > 1", + matchingNodes: nodeData, + totalNodeCount: len(nodeData), + expected: false, + isError: false, + }, + { + name: "max(gpuCapacity) == 4 (true)", + conditional: "max(gpuCapacity) == 4", + matchingNodes: nodeData, + totalNodeCount: len(nodeData), + expected: true, + isError: false, + }, + { + name: "max(gpuAllocatable) == 4 (true)", + conditional: "max(gpuAllocatable) == 4", + matchingNodes: nodeData, + totalNodeCount: len(nodeData), + expected: true, + isError: false, + }, + { + name: "sum(gpuAllocatable) > 1 (true)", + conditional: "sum(gpuAllocatable) > 1", + matchingNodes: nodeData, + totalNodeCount: len(nodeData), + expected: true, + isError: false, + }, + { + name: "sum(gpuAllocatable) >= 8 (false)", + conditional: "sum(gpuAllocatable) >= 8", + matchingNodes: nodeData, + totalNodeCount: len(nodeData), + expected: false, + isError: false, + }, } for _, test := range tests { @@ -404,6 +470,7 @@ func Test_nodeMatchesFilters(t *testing.T) { "hugepages-2Mi": resource.MustParse("0"), "memory": resource.MustParse("7951376Ki"), "pods": resource.MustParse("29"), + gpuResourceName: resource.MustParse("1"), }, Allocatable: corev1.ResourceList{ "attachable-volumes-aws-ebs": resource.MustParse("25"), @@ -413,6 +480,7 @@ func Test_nodeMatchesFilters(t *testing.T) { "hugepages-2Mi": resource.MustParse("0"), "memory": resource.MustParse("7848976Ki"), "pods": resource.MustParse("29"), + gpuResourceName: resource.MustParse("1"), }, }, } @@ -626,6 +694,38 @@ func Test_nodeMatchesFilters(t *testing.T) { }, expectResult: false, }, + { + name: "true when gpu capacity is available", + node: node, + filters: &troubleshootv1beta2.NodeResourceFilters{ + GPUCapacity: "1", + }, + expectResult: true, + }, + { + name: "true when allocatable gpu is available", + node: node, + filters: &troubleshootv1beta2.NodeResourceFilters{ + GPUAllocatable: "1", + }, + expectResult: true, + }, + { + name: "false when gpu capacity is not available", + node: node, + filters: &troubleshootv1beta2.NodeResourceFilters{ + GPUCapacity: "2", + }, + expectResult: false, + }, + { + name: "false when allocatable gpu is not available", + node: node, + filters: &troubleshootv1beta2.NodeResourceFilters{ + GPUAllocatable: "2", + }, + expectResult: false, + }, } for _, test := range tests { diff --git a/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go b/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go index 0ce3e8f24..f1b459df2 100644 --- a/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go +++ b/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go @@ -131,6 +131,8 @@ type NodeResourceFilters struct { PodAllocatable string `json:"podAllocatable,omitempty" yaml:"podAllocatable,omitempty"` EphemeralStorageCapacity string `json:"ephemeralStorageCapacity,omitempty" yaml:"ephemeralStorageCapacity,omitempty"` EphemeralStorageAllocatable string `json:"ephemeralStorageAllocatable,omitempty" yaml:"ephemeralStorageAllocatable,omitempty"` + GPUCapacity string `json:"gpuCapacity,omitempty" yaml:"gpuCapacity,omitempty"` + GPUAllocatable string `json:"gpuAllocatable,omitempty" yaml:"gpuAllocatable,omitempty"` Selector *NodeResourceSelectors `json:"selector,omitempty" yaml:"selector,omitempty"` } diff --git a/schemas/analyzer-troubleshoot-v1beta2.json b/schemas/analyzer-troubleshoot-v1beta2.json index 29d46b5ea..c88e432ed 100644 --- a/schemas/analyzer-troubleshoot-v1beta2.json +++ b/schemas/analyzer-troubleshoot-v1beta2.json @@ -1770,6 +1770,12 @@ "ephemeralStorageCapacity": { "type": "string" }, + "gpuAllocatable": { + "type": "string" + }, + "gpuCapacity": { + "type": "string" + }, "memoryAllocatable": { "type": "string" }, diff --git a/schemas/preflight-troubleshoot-v1beta2.json b/schemas/preflight-troubleshoot-v1beta2.json index 8e1e2af55..452f9db29 100644 --- a/schemas/preflight-troubleshoot-v1beta2.json +++ b/schemas/preflight-troubleshoot-v1beta2.json @@ -1770,6 +1770,12 @@ "ephemeralStorageCapacity": { "type": "string" }, + "gpuAllocatable": { + "type": "string" + }, + "gpuCapacity": { + "type": "string" + }, "memoryAllocatable": { "type": "string" }, diff --git a/schemas/supportbundle-troubleshoot-v1beta2.json b/schemas/supportbundle-troubleshoot-v1beta2.json index d7c25e705..f3e6f4d39 100644 --- a/schemas/supportbundle-troubleshoot-v1beta2.json +++ b/schemas/supportbundle-troubleshoot-v1beta2.json @@ -1816,6 +1816,12 @@ "ephemeralStorageCapacity": { "type": "string" }, + "gpuAllocatable": { + "type": "string" + }, + "gpuCapacity": { + "type": "string" + }, "memoryAllocatable": { "type": "string" },