forked from shawwn/llama
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathsetup_nodes.sh
executable file
·60 lines (45 loc) · 2.48 KB
/
setup_nodes.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
LAMBDA_CLOUD_KEY="path-to-your-cloud-ssh-key"
HEAD_IP="xxx.xxx.xxx.xxx"
WORKER_IP="xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx"
ALL_IP=( $HEAD_IP "${WORKER_IP[@]}" )
echo "List of nodes: "
for IP in ${ALL_IP[*]}; do
echo $IP
done
echo "Generate ssh keys on the head node ------------------------------"
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$HEAD_IP "ssh-keygen -t rsa -N ''"
echo "Add public key to the all nodes ------------------------------"
for IP in ${ALL_IP[*]}; do
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$HEAD_IP "cat ~/.ssh/id_rsa.pub" | ssh -i $LAMBDA_CLOUD_KEY ubuntu@$IP "cat >> ~/.ssh/authorized_keys"
done
echo "Set NCCL_IB_DISABLE=1 for all nodes ------------------------------"
for IP in ${ALL_IP[*]}; do
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$IP "echo export NCCL_IB_DISABLE=1 >> .bashrc"
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$IP "echo NCCL_IB_DISABLE=1 | sudo tee -a /etc/environment"
done
echo "Let the head node ssh into the all nodes at least once so in the future it won't ask about fingerprint ------------------------------"
for IP in ${ALL_IP[*]}; do
ssh -i $LAMBDA_CLOUD_KEY -t ubuntu@$HEAD_IP "echo exit | xargs ssh ubuntu@$IP"
done
echo "Set up NFS ------------------------------"
for IP in ${ALL_IP[*]}; do
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$IP "if [ ! -d shared ]; then mkdir shared; fi"
done
cat head-nfs-install.sh | sed "s/ [\\]//g" | ssh -i $LAMBDA_CLOUD_KEY ${HEAD_IP}
for IP in ${WORKER_IP[*]}; do
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$HEAD_IP "echo '/home/ubuntu/shared ${IP}(rw,sync,no_subtree_check)' | sudo tee -a /etc/exports"
done
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$HEAD_IP "sudo systemctl restart nfs-kernel-server"
echo "NFS set up on the head node"
for IP in ${WORKER_IP[*]}; do
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$IP "sudo mount ${HEAD_IP}:/home/ubuntu/shared /home/ubuntu/shared"
done
echo "NFS set up on the worker nodes"
echo "Clone repos into NFS ------------------------------"
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$HEAD_IP "if [ ! -d /home/ubuntu/shared/llama ]; then git clone https://github.com/LambdaLabsML/llama.git /home/ubuntu/shared/llama; fi"
ssh -i $LAMBDA_CLOUD_KEY ubuntu@$HEAD_IP "if [ ! -d /home/ubuntu/shared/llama-dl ]; then git clone https://github.com/chuanli11/llama-dl.git /home/ubuntu/shared/llama-dl; fi"
echo "Install LLAMA dependencies (asynchronously) ------------------------------"
for IP in ${ALL_IP[*]}; do cat dependencies-install.sh | ssh -i $LAMBDA_CLOUD_KEY ${IP} & done
wait
echo "All instances are successfully set up 🥳🥳🥳🥳🥳🥳🥳🥳"