fabtests: Bugfixes for neuron
This commit fixes the following bugs in the neuron fabtests:

1. Neuron accelerator detection was broken on some OSs because the
   full path of the `neuron-ls` executable was not used.

2. Before this commit, each pytest worker was assigned a single neuron
   core. This works for multi-node tests but fails for single-node
   tests, because a neuron core can only be opened by a single process.
   This commit assigns two different neuron cores to each pytest worker
   for client-server tests: one for the server and one for the client.
   Trn1 has 2 cores per neuron device and Trn2 has 8 cores per neuron
   device, so this assignment works for both (see the sketch after this
   list).

3. When running in serial mode, the PYTEST_XDIST_WORKER env var is not
   set, so the NEURON_RT_VISIBLE_CORES env var is also not set. This
   causes the server to occupy all neuron cores and the client to fail.
   This commit therefore assigns device 0 to both the server and the
   client when running with a single worker.
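
A minimal sketch of the resulting core assignment (the helper name and
the 16-device count are illustrative, not from the diff; the real logic
lives in prepare_base_command in fabtests/pytest/common.py):

    def visible_core(worker_id, num_devices, cores_per_device, is_server):
        # Serial mode: PYTEST_XDIST_WORKER is unset, so fall back to device 0.
        device_id = worker_id % num_devices if worker_id is not None else 0
        # Each worker owns a pair of cores on its device: the server gets
        # the first core and the client gets the second.
        return device_id * cores_per_device + (0 if is_server else 1)

    # Trn1 (2 cores per device): worker 0 -> server core 0, client core 1
    assert visible_core(0, 16, 2, is_server=True) == 0
    assert visible_core(0, 16, 2, is_server=False) == 1
    # Trn2 (8 cores per device): worker 1 -> server core 8, client core 9
    assert visible_core(1, 16, 8, is_server=True) == 8
    assert visible_core(1, 16, 8, is_server=False) == 9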

Signed-off-by: Sai Sunku <sunkusa@amazon.com>
(cherry picked from commit f893f5f)
sunkuamzn authored and shijin-aws committed Jan 2, 2025
1 parent 71bdde7 commit 83d127f
Showing 1 changed file with 20 additions and 13 deletions.
fabtests/pytest/common.py
@@ -68,7 +68,7 @@ def num_cuda_devices(ip):
 @functools.lru_cache(10)
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def num_neuron_devices(ip):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")

@@ -84,7 +84,7 @@ def num_neuron_devices(ip):
 @functools.lru_cache(10)
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def num_neuron_cores_on_device(ip, device_id):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")

@@ -97,7 +97,7 @@ def num_neuron_cores_on_device(ip, device_id):
 
 @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
 def is_neuron_device_available(ip, device_id):
-    proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
+    proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                timeout=60, encoding="utf-8")

@@ -455,19 +455,26 @@ def prepare_base_command(self, command_type, executable,
         if "PYTEST_XDIST_WORKER" in os.environ:
             worker_id = int(os.environ["PYTEST_XDIST_WORKER"].replace("gw", ""))
             hmem_device_id = worker_id % num_hmem
-            if host_memory_type == "cuda":
-                command += " -i {}".format(hmem_device_id)
-            else:
-                assert host_memory_type == "neuron"
-                num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id)
+        else:
+            hmem_device_id = 0
+
+        if host_memory_type == "cuda":
+            command += " -i {}".format(hmem_device_id)
+        else:
+            assert host_memory_type == "neuron"
+            num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id)
+            if command_type == "server":
+                additional_environment = "NEURON_RT_VISIBLE_CORES={}".format(
+                    hmem_device_id * num_cores)
+                wait_until_neuron_device_available(host_ip, hmem_device_id)
+            else:
+                additional_environment = "NEURON_RT_VISIBLE_CORES={}".format(
+                    hmem_device_id * num_cores + 1)
+                wait_until_neuron_device_available(host_ip, hmem_device_id)
 
-            if self._cmdline_args.provider == "efa":
-                import efa.efa_common
-                efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem)
-                command += " -d {}-rdm".format(efa_device)
+        if self._cmdline_args.provider == "efa":
+            import efa.efa_common
+            efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem)
+            command += " -d {}-rdm".format(efa_device)
 
         return command, additional_environment
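
The three call-site fixes above all target the same helper pattern: run
`neuron-ls` remotely over ssh and parse its JSON output. A minimal
sketch of that pattern, assuming `neuron-ls -j` prints a JSON list with
one object per device carrying an `nc_count` (cores per device) field
(the exact schema is an assumption here, not taken from the diff):

    import json
    import subprocess
    from subprocess import run

    # Full path: a non-interactive ssh shell may not have
    # /opt/aws/neuron/bin in PATH, which is exactly bug 1 above.
    NEURON_LS = "/opt/aws/neuron/bin/neuron-ls"

    def neuron_devices(ip):
        # One JSON object per neuron device (assumed output shape).
        proc = run("ssh {} {} -j".format(ip, NEURON_LS), shell=True,
                   stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                   timeout=60, encoding="utf-8")
        proc.check_returncode()
        return json.loads(proc.stdout)

    def num_neuron_devices(ip):
        return len(neuron_devices(ip))

    def num_neuron_cores_on_device(ip, device_id):
        # nc_count would be 2 on Trn1 and 8 on Trn2 per the commit message.
        return neuron_devices(ip)[device_id]["nc_count"]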

