Merge pull request #16 from okoge-kaz/feature/v1.0.1
version v1.0.1
Showing 50 changed files with 3,556 additions and 143 deletions.
scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh
122 changes: 122 additions & 0 deletions
#!/bin/bash
#$ -l rt_AF=8
#$ -l h_rt=1:00:00:00
#$ -j y
#$ -o outputs/instruction/Llama-3.1-8B/
#$ -cwd

# module load
source /etc/profile.d/modules.sh
module use /bb/llm/gaf51275/modules/modulefiles

module load cuda/12.1/12.1.1
module load cudnn/cuda-12.1/9.0.0
module load nccl/2.20.5
module load hpcx/2.12
module load gcc/11.4.0

# switch virtual env
source .env/bin/activate

# distributed settings
export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile

if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then
  export NUM_GPU_PER_NODE=4
  NODE_TYPE="v100"
elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then
  export NUM_GPU_PER_NODE=8
  NODE_TYPE="a100"
else
  echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE"
fi

NUM_NODES=$NHOSTS
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

mkdir -p ./hostfile

HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
while read -r line; do
  echo "${line} slots=${NUM_GPU_PER_NODE}"
done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"

# training config
SEQ_LENGTH=8192
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256

# optimizer config
LR=1e-5
MIN_LR=1e-6
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# checkpoint
TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct
CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500
CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"

mkdir -p ${CHECKPOINT_SAVE_DIR}

# dataset
DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1

TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl
VALID_DATA_PATH=${DATASET_DIR}/train.jsonl

# job name
JOB_NAME="Llama-3.1-8B-instruct-exp-1.1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
  --npernode $NUM_GPU_PER_NODE \
  -hostfile $HOSTFILE_NAME \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
  -x NCCL_IB_TIMEOUT=22 \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length ${SEQ_LENGTH} \
  --micro-batch-size ${MICRO_BATCH_SIZE} \
  --global-batch-size ${GLOBAL_BATCH_SIZE} \
  --hf-transformer-model-dir ${TOKENIZER_DIR} \
  --instruction-train-data-path ${TRAIN_DATA_PATH} \
  --instruction-valid-data-path ${VALID_DATA_PATH} \
  --epoch 2 \
  --lr ${LR} \
  --min-lr ${MIN_LR} \
  --lr-decay-style cosine \
  --weight-decay ${WEIGHT_DECAY} \
  --grad-clip-norm ${GRAD_CLIP} \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --save-interval 50000 \
  --eval-interval 500 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model ${CHECKPOINT_DIR} \
  --save ${CHECKPOINT_SAVE_DIR} \
  --load ${CHECKPOINT_SAVE_DIR} \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --instruction-tuning \
  --save-sampler-state \
  --use-mpi \
  --wandb-entity "prj-jalm" \
  --wandb-project "Llama-3.1-8B-Instruct" \
  --wandb-name "${JOB_NAME}"
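
For reference, the script above is a Grid Engine batch job: the "#$ -l rt_AF=8" directive requests eight A100 nodes, which matches the NUM_GPU_PER_NODE=8 branch. A minimal, hedged sketch of how it might be submitted from the repository root on ABCI follows; the group ID is a placeholder, and the mkdir is only a precaution so the "#$ -o" log directory exists, not part of the committed script.

# hypothetical submission (group ID is a placeholder)
mkdir -p outputs/instruction/Llama-3.1-8B
qsub -g <your-abci-group> scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh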
scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh
122 changes: 122 additions & 0 deletions
#!/bin/bash
#$ -l rt_AF=8
#$ -l h_rt=1:00:00:00
#$ -j y
#$ -o outputs/instruction/Llama-3.1-8B/
#$ -cwd

# module load
source /etc/profile.d/modules.sh
module use /bb/llm/gaf51275/modules/modulefiles

module load cuda/12.1/12.1.1
module load cudnn/cuda-12.1/9.0.0
module load nccl/2.20.5
module load hpcx/2.12
module load gcc/11.4.0

# switch virtual env
source .env/bin/activate

# distributed settings
export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile

if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then
  export NUM_GPU_PER_NODE=4
  NODE_TYPE="v100"
elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then
  export NUM_GPU_PER_NODE=8
  NODE_TYPE="a100"
else
  echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE"
fi

NUM_NODES=$NHOSTS
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

mkdir -p ./hostfile

HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
while read -r line; do
  echo "${line} slots=${NUM_GPU_PER_NODE}"
done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"

# training config
SEQ_LENGTH=8192
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256

# optimizer config
LR=2e-6
MIN_LR=2e-7
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# checkpoint
TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct
CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500
CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"

mkdir -p ${CHECKPOINT_SAVE_DIR}

# dataset
DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1

TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl
VALID_DATA_PATH=${DATASET_DIR}/train.jsonl

# job name
JOB_NAME="Llama-3.1-8B-instruct-exp-1.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
  --npernode $NUM_GPU_PER_NODE \
  -hostfile $HOSTFILE_NAME \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
  -x NCCL_IB_TIMEOUT=22 \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length ${SEQ_LENGTH} \
  --micro-batch-size ${MICRO_BATCH_SIZE} \
  --global-batch-size ${GLOBAL_BATCH_SIZE} \
  --hf-transformer-model-dir ${TOKENIZER_DIR} \
  --instruction-train-data-path ${TRAIN_DATA_PATH} \
  --instruction-valid-data-path ${VALID_DATA_PATH} \
  --epoch 2 \
  --lr ${LR} \
  --min-lr ${MIN_LR} \
  --lr-decay-style cosine \
  --weight-decay ${WEIGHT_DECAY} \
  --grad-clip-norm ${GRAD_CLIP} \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --save-interval 50000 \
  --eval-interval 500 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model ${CHECKPOINT_DIR} \
  --save ${CHECKPOINT_SAVE_DIR} \
  --load ${CHECKPOINT_SAVE_DIR} \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --instruction-tuning \
  --save-sampler-state \
  --use-mpi \
  --wandb-entity "prj-jalm" \
  --wandb-project "Llama-3.1-8B-Instruct" \
  --wandb-name "${JOB_NAME}"
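
Llama-3.1-8B-instruct-exp1-2.sh is identical to exp1-1 except for the learning-rate schedule (LR=2e-6 and MIN_LR=2e-7 instead of LR=1e-5 and MIN_LR=1e-6); both jobs read the exp1-1 dataset and reuse train.jsonl as the validation set. As a hedged sketch (not part of the committed scripts), the hostfile and rank count that mpirun depends on could be checked inside the job before launch:

# hypothetical pre-launch check: with rt_AF=8, the generated hostfile should list
# 8 nodes with slots=8 each, so mpirun starts NUM_NODES * NUM_GPU_PER_NODE = 64 ranks
HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
wc -l "${HOSTFILE_NAME}"              # expect 8 lines, one per node
grep -c 'slots=8' "${HOSTFILE_NAME}"  # expect 8
echo "launching $(( $(wc -l < "${HOSTFILE_NAME}") * 8 )) MPI ranks"   # expect 64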