Merge pull request #16 from okoge-kaz/feature/v1.0.1
version v1.0.1
Showing 50 changed files with 3,556 additions and 143 deletions.
scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh
122 changes: 122 additions & 0 deletions
#!/bin/bash
#$ -l rt_AF=8
#$ -l h_rt=1:00:00:00
#$ -j y
#$ -o outputs/instruction/Llama-3.1-8B/
#$ -cwd

# module load
source /etc/profile.d/modules.sh
module use /bb/llm/gaf51275/modules/modulefiles

module load cuda/12.1/12.1.1
module load cudnn/cuda-12.1/9.0.0
module load nccl/2.20.5
module load hpcx/2.12
module load gcc/11.4.0

# switch virtual env
source .env/bin/activate

# distributed settings
export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile

if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then
  export NUM_GPU_PER_NODE=4
  NODE_TYPE="v100"
elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then
  export NUM_GPU_PER_NODE=8
  NODE_TYPE="a100"
else
  echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE"
fi

NUM_NODES=$NHOSTS
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

mkdir -p ./hostfile

HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
while read -r line; do
  echo "${line} slots=${NUM_GPU_PER_NODE}"
done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"

# training config
SEQ_LENGTH=8192
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256

# optimizer config
LR=1e-5
MIN_LR=1e-6
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# checkpoint
TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct
CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500
CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"

mkdir -p ${CHECKPOINT_SAVE_DIR}

# dataset
DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1

TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl
VALID_DATA_PATH=${DATASET_DIR}/train.jsonl

# job name
JOB_NAME="Llama-3.1-8B-instruct-exp-1.1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
  --npernode $NUM_GPU_PER_NODE \
  -hostfile $HOSTFILE_NAME \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
  -x NCCL_IB_TIMEOUT=22 \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length ${SEQ_LENGTH} \
  --micro-batch-size ${MICRO_BATCH_SIZE} \
  --global-batch-size ${GLOBAL_BATCH_SIZE} \
  --hf-transformer-model-dir ${TOKENIZER_DIR} \
  --instruction-train-data-path ${TRAIN_DATA_PATH} \
  --instruction-valid-data-path ${VALID_DATA_PATH} \
  --epoch 2 \
  --lr ${LR} \
  --min-lr ${MIN_LR} \
  --lr-decay-style cosine \
  --weight-decay ${WEIGHT_DECAY} \
  --grad-clip-norm ${GRAD_CLIP} \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --save-interval 50000 \
  --eval-interval 500 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model ${CHECKPOINT_DIR} \
  --save ${CHECKPOINT_SAVE_DIR} \
  --load ${CHECKPOINT_SAVE_DIR} \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --instruction-tuning \
  --save-sampler-state \
  --use-mpi \
  --wandb-entity "prj-jalm" \
  --wandb-project "Llama-3.1-8B-Instruct" \
  --wandb-name "${JOB_NAME}"
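
For reference, the script above is a Grid Engine batch job: the "#$ -l rt_AF=8" directive requests eight A100 nodes, which matches the NUM_GPU_PER_NODE=8 branch. A minimal, hedged sketch of how it might be submitted from the repository root on ABCI follows; the group ID is a placeholder, and the mkdir is only a precaution so the "#$ -o" log directory exists, not part of the committed script.

# hypothetical submission (group ID is a placeholder)
mkdir -p outputs/instruction/Llama-3.1-8B
qsub -g <your-abci-group> scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh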
scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh
122 changes: 122 additions & 0 deletions
#!/bin/bash
#$ -l rt_AF=8
#$ -l h_rt=1:00:00:00
#$ -j y
#$ -o outputs/instruction/Llama-3.1-8B/
#$ -cwd

# module load
source /etc/profile.d/modules.sh
module use /bb/llm/gaf51275/modules/modulefiles

module load cuda/12.1/12.1.1
module load cudnn/cuda-12.1/9.0.0
module load nccl/2.20.5
module load hpcx/2.12
module load gcc/11.4.0

# switch virtual env
source .env/bin/activate

# distributed settings
export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile

if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then
  export NUM_GPU_PER_NODE=4
  NODE_TYPE="v100"
elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then
  export NUM_GPU_PER_NODE=8
  NODE_TYPE="a100"
else
  echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE"
fi

NUM_NODES=$NHOSTS
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

mkdir -p ./hostfile

HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
while read -r line; do
  echo "${line} slots=${NUM_GPU_PER_NODE}"
done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"

# training config
SEQ_LENGTH=8192
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256

# optimizer config
LR=2e-6
MIN_LR=2e-7
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# checkpoint
TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct
CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500
CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"

mkdir -p ${CHECKPOINT_SAVE_DIR}

# dataset
DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1

TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl
VALID_DATA_PATH=${DATASET_DIR}/train.jsonl

# job name
JOB_NAME="Llama-3.1-8B-instruct-exp-1.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
  --npernode $NUM_GPU_PER_NODE \
  -hostfile $HOSTFILE_NAME \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
  -x NCCL_IB_TIMEOUT=22 \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length ${SEQ_LENGTH} \
  --micro-batch-size ${MICRO_BATCH_SIZE} \
  --global-batch-size ${GLOBAL_BATCH_SIZE} \
  --hf-transformer-model-dir ${TOKENIZER_DIR} \
  --instruction-train-data-path ${TRAIN_DATA_PATH} \
  --instruction-valid-data-path ${VALID_DATA_PATH} \
  --epoch 2 \
  --lr ${LR} \
  --min-lr ${MIN_LR} \
  --lr-decay-style cosine \
  --weight-decay ${WEIGHT_DECAY} \
  --grad-clip-norm ${GRAD_CLIP} \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --save-interval 50000 \
  --eval-interval 500 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model ${CHECKPOINT_DIR} \
  --save ${CHECKPOINT_SAVE_DIR} \
  --load ${CHECKPOINT_SAVE_DIR} \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --instruction-tuning \
  --save-sampler-state \
  --use-mpi \
  --wandb-entity "prj-jalm" \
  --wandb-project "Llama-3.1-8B-Instruct" \
  --wandb-name "${JOB_NAME}"
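
Llama-3.1-8B-instruct-exp1-2.sh is identical to exp1-1 except for the learning-rate schedule (LR=2e-6 and MIN_LR=2e-7 instead of LR=1e-5 and MIN_LR=1e-6); both jobs read the exp1-1 dataset and reuse train.jsonl as the validation set. As a hedged sketch (not part of the committed scripts), the hostfile and rank count that mpirun depends on could be checked inside the job before launch:

# hypothetical pre-launch check: with rt_AF=8, the generated hostfile should list
# 8 nodes with slots=8 each, so mpirun starts NUM_NODES * NUM_GPU_PER_NODE = 64 ranks
HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
wc -l "${HOSTFILE_NAME}"              # expect 8 lines, one per node
grep -c 'slots=8' "${HOSTFILE_NAME}"  # expect 8
echo "launching $(( $(wc -l < "${HOSTFILE_NAME}") * 8 )) MPI ranks"   # expect 64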