From d8d75d4553019ec05f2ca15b8531eff413d20bf6 Mon Sep 17 00:00:00 2001
From: abhijeet-dhumal
Date: Thu, 9 Jan 2025 16:20:23 +0530
Subject: [PATCH] Add KFTO pytorch multi-node multi-gpu tests for GPUs with AMD ROCm and NVIDIA Cuda

---
Notes (ignored by git am):
  * Appends two Robot Framework test cases to the training-stack suite. Each
    one calls the existing `Run Training Operator KFTO Test` keyword with a
    Go test name and an image variable (${CUDA_TRAINING_IMAGE} or
    ${ROCM_TRAINING_IMAGE}), and is tagged `Kfto-MultiNodeMultiGpu` and
    `Training`.
  * Per the [Documentation] rows, each test requires 2 cluster nodes with
    2 GPUs each.
  * NOTE(review): this patch was restored from a copy whose line breaks and
    multi-space cell separators had been collapsed. Separator widths below
    are a reconstruction (4 spaces); if the context rows do not match the
    target file exactly, apply with `git apply --ignore-whitespace`.

 .../test-run-training-stack-tests.robot | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ods_ci/tests/Tests/0600__distributed_workloads/0602__training/test-run-training-stack-tests.robot b/ods_ci/tests/Tests/0600__distributed_workloads/0602__training/test-run-training-stack-tests.robot
index 763aa01a1..48c04d8fd 100644
--- a/ods_ci/tests/Tests/0600__distributed_workloads/0602__training/test-run-training-stack-tests.robot
+++ b/ods_ci/tests/Tests/0600__distributed_workloads/0602__training/test-run-training-stack-tests.robot
@@ -75,3 +75,15 @@ Run Training operator KFTO_MNIST multi-node test with AMD ROCm image
     ...    Training
     ...    TrainingOperator
     Run Training Operator KFTO Test    TestPyTorchJobMnistMultiNodeWithROCm    ${ROCM_TRAINING_IMAGE}
+
+Run Training operator KFTO_MNIST multi-node multi-gpu test with NVIDIA CUDA image
+    [Documentation]    Run Go KFTO_MNIST multi-node multi-gpu test for Training operator using PyTorch job with NVIDIA CUDA image - It requires 2 cluster-nodes with 2 GPUs each
+    [Tags]    Kfto-MultiNodeMultiGpu
+    ...    Training
+    Run Training Operator KFTO Test    TestPyTorchJobMnistMultiNodeMultiGpuWithCuda    ${CUDA_TRAINING_IMAGE}
+
+Run Training operator KFTO_MNIST multi-node multi-gpu test with AMD ROCm image
+    [Documentation]    Run Go KFTO_MNIST multi-node multi-gpu test for Training operator using PyTorch job with AMD ROCm image - It requires 2 cluster-nodes with 2 GPUs each
+    [Tags]    Kfto-MultiNodeMultiGpu
+    ...    Training
+    Run Training Operator KFTO Test    TestPyTorchJobMnistMultiNodeMultiGpuWithROCm    ${ROCM_TRAINING_IMAGE}