Skip to content

Commit

Permalink
Fix the CUDA_DEVICE_MAX_CONNECTIONS check
Browse files Browse the repository at this point in the history
  • Loading branch information
sallyjunjun committed Dec 2, 2024
1 parent 317f18c commit e0968cf
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions internlm/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,14 @@ def enable_pytorch_expandable_segments():


def check_cuda_env():
# Validates required environment variables with assertions; a failed check
# raises AssertionError with a message naming the variable and the expected
# value of 1.
#
# NOTE(review): this is a rendered diff with indentation and +/- markers
# stripped. The next three lines appear to be the pre-commit version (the
# "3 deletions"): the CUDA_DEVICE_MAX_CONNECTIONS check ran unconditionally.
max_connections = os.getenv("CUDA_DEVICE_MAX_CONNECTIONS")
assert max_connections is not None, "Env var CUDA_DEVICE_MAX_CONNECTIONS has not been set, please set it to 1!"
assert max_connections == '1', "Env var CUDA_DEVICE_MAX_CONNECTIONS is set to {}, but it should be set to 1!".format(max_connections)
# Presumably the post-commit version starts here (the "8 additions"): the
# CUDA_DEVICE_MAX_CONNECTIONS check now runs only when the accelerator backend
# is reported as AcceleratorType.GPU.
if internlm_accelerator.get_accelerator_backend() == AcceleratorType.GPU:
max_connections = os.getenv("CUDA_DEVICE_MAX_CONNECTIONS")
# os.getenv yields a string (or None when unset), hence the separate
# "is not None" check and the comparison against '1' rather than the integer 1.
assert max_connections is not None, "Env var CUDA_DEVICE_MAX_CONNECTIONS has not been set, please set it to 1!"
assert max_connections == '1', "Env var CUDA_DEVICE_MAX_CONNECTIONS is set to {}, but it should be set to 1!".format(max_connections)

# Same two-step validation for TORCH_NCCL_AVOID_RECORD_STREAMS.
# NOTE(review): with indentation lost it is unclear whether this check is also
# nested under the GPU branch above — confirm against the repository.
avoid_record_streams = os.getenv("TORCH_NCCL_AVOID_RECORD_STREAMS")
assert avoid_record_streams is not None, "Env var TORCH_NCCL_AVOID_RECORD_STREAMS has not been set, please set it to 1!"
assert avoid_record_streams == '1', "Env var TORCH_NCCL_AVOID_RECORD_STREAMS is set to {}, but it should be set to 1!".format(avoid_record_streams)


class DummyProfile:
Expand Down

0 comments on commit e0968cf

Please sign in to comment.