# ptl_ddp_multi_node.yml
# Run configuration for a multi-node PyTorch Lightning DDP job launched with
# torchrun. NOTE(review): the key layout (name / image / compute /
# integrations / command) looks like a MosaicML platform (mcli) run spec;
# confirm against the consuming tool's schema.
#
# Sizing: the active torchrun call starts 8 processes per node on 4 nodes,
# i.e. 4 x 8 = 32 processes, matching `compute.gpus: 32` below. Keep the
# three numbers in sync when resizing the job.
name: ptl-mct-multi-node
image: mosaicml/composer:latest

compute:
  # Number of GPUs to use.
  gpus: 32
  # Insert cluster name.
  cluster: r14z3p1
  # Type of GPU to use.
  # NOTE(review): the previous comment here said "We use a100_80gb in our
  # experiments" while the value is h100_80gb; confirm which is intended.
  gpu_type: h100_80gb

integrations:
  - integration_type: "git_repo"
    git_repo: SamRaymond/composer_test
    git_branch: main

# Shell script run for the job. It enters the composer_test/ checkout,
# creates world-writable dataset directories under /tmp/dataset/, installs
# Python dependencies, then launches ptl_ddp_multi_node.py through torchrun.
# NODE_RANK, MASTER_ADDR and MASTER_PORT are read from the environment
# (presumably injected by the platform on each node; verify).
# If torchrun fails, the fallback prints a message, kills python processes
# and exits non-zero.
# The commented-out torchrun call is the earlier 2-node variant; the
# "Epoch 9" lines are progress-bar output recorded after each variant.
command: |-
  cd composer_test/
  mkdir -p /tmp/dataset/
  mkdir -p /tmp/dataset/train/
  mkdir -p /tmp/dataset/test/
  chmod -R 777 /tmp/dataset/test/ /tmp/dataset/train/
  pip install -Uq torchvision transformers lightning
  # torchrun \
  # --nproc_per_node=8 \
  # --nnodes=2 \
  # --node_rank $NODE_RANK \
  # --master_addr $MASTER_ADDR \
  # --master_port $MASTER_PORT \
  # ptl_ddp_multi_node.py || (echo "Command failed - killing python" && pkill python && exit 1)
  # Epoch 9: 100%|██████████| 98/98 [00:01<00:00, 67.41it/s, v_num=0]
  torchrun \
    --nproc_per_node=8 \
    --nnodes=4 \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    ptl_ddp_multi_node.py || (echo "Command failed - killing python" && pkill python && exit 1)
  # Epoch 9: 100%|██████████| 49/49 [00:00<00:00, 51.58it/s, v_num=0]