Merge pull request #16 from okoge-kaz/feature/v1.0.1
version v1.0.1
okoge-kaz authored Sep 16, 2024
2 parents e17075a + 3a9da08 commit 890d9dd
Showing 50 changed files with 3,556 additions and 143 deletions.
2 changes: 2 additions & 0 deletions .vscode/settings.json
@@ -47,6 +47,7 @@
"pbar",
"peft",
"plamo",
"pretraining",
"probs",
"psutil",
"pubmed",
@@ -57,6 +58,7 @@
"stabilityai",
"stablelm",
"stockmark",
"tensorboard",
"tflops",
"tobytes",
"Xformer"
@@ -1,6 +1,6 @@
 #!/bin/bash
-#$ -l rt_AF=2
-#$ -l h_rt=0:01:00:00
+#$ -l rt_AF=1
+#$ -l h_rt=0:08:00:00
 #$ -j y
 #$ -o outputs/instruction/Llama-3-8B/
 #$ -cwd
@@ -82,7 +82,7 @@ mpirun -np $NUM_GPUS \
 -x MASTER_ADDR=$MASTER_ADDR \
 -x MASTER_PORT=$MASTER_PORT \
 -bind-to none \
--x PATH \
+-x NCCL_IB_TIMEOUT=22 \
 -x LD_LIBRARY_PATH \
 -x PATH \
 python examples/finetuning.py \
@@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \
 --adam-beta1 0.9 \
 --adam-beta2 0.95 \
 --adam-eps 1e-8 \
---save-interval 500 \
+--save-interval 10 \
 --eval-interval 500 \
 --eval-iters 10 \
 --bf16 \
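Note on the mpirun hunk above: the duplicated -x PATH export is replaced with -x NCCL_IB_TIMEOUT=22. Per NCCL's documentation, this value is the InfiniBand queue-pair timeout exponent, and the effective timeout is 4.096 µs * 2^value, so 22 allows roughly 17 seconds before a retransmit, which helps multi-node jobs ride out transient fabric stalls. A quick check of the arithmetic (awk is used here only for illustration):

awk 'BEGIN { printf "%.1f s\n", 4.096e-6 * 2^22 }'   # prints 17.2 s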
122 changes: 122 additions & 0 deletions scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh
@@ -0,0 +1,122 @@
#!/bin/bash
#$ -l rt_AF=8
#$ -l h_rt=1:00:00:00
#$ -j y
#$ -o outputs/instruction/Llama-3.1-8B/
#$ -cwd

# module load
source /etc/profile.d/modules.sh
module use /bb/llm/gaf51275/modules/modulefiles

module load cuda/12.1/12.1.1
module load cudnn/cuda-12.1/9.0.0
module load nccl/2.20.5
module load hpcx/2.12
module load gcc/11.4.0

# switch virtual env
source .env/bin/activate

# distributed settings
export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))
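# The expression above keeps the port in [10000, 59999]; e.g. a hypothetical JOB_ID=123456 yields 10000 + (123456 % 50000) = 33456.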

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile

if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then
export NUM_GPU_PER_NODE=4
NODE_TYPE="v100"
elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then
export NUM_GPU_PER_NODE=8
NODE_TYPE="a100"
else
echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE"
exit 1
fi

NUM_NODES=$NHOSTS
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

mkdir -p ./hostfile

HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
while read -r line; do
echo "${line} slots=${NUM_GPU_PER_NODE}"
done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"

# training config
SEQ_LENGTH=8192
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256
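# With 8 nodes x 8 GPUs = 64 data-parallel ranks and MICRO_BATCH_SIZE=1, GLOBAL_BATCH_SIZE=256 implies 256 / (1 * 64) = 4 gradient-accumulation steps, assuming the usual global = micro x DP x accumulation relation.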

# optimizer config
LR=1e-5
MIN_LR=1e-6
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# checkpoint
TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct
CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500
CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"

mkdir -p ${CHECKPOINT_SAVE_DIR}

# dataset
DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1

TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl
VALID_DATA_PATH=${DATASET_DIR}/train.jsonl

# job name
JOB_NAME="Llama-3.1-8B-instruct-exp-1.1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
--npernode $NUM_GPU_PER_NODE \
-hostfile $HOSTFILE_NAME \
-x MASTER_ADDR=$MASTER_ADDR \
-x MASTER_PORT=$MASTER_PORT \
-bind-to none \
-x NCCL_IB_TIMEOUT=22 \
-x LD_LIBRARY_PATH \
-x PATH \
python examples/finetuning.py \
--seq-length ${SEQ_LENGTH} \
--micro-batch-size ${MICRO_BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--hf-transformer-model-dir ${TOKENIZER_DIR} \
--instruction-train-data-path ${TRAIN_DATA_PATH} \
--instruction-valid-data-path ${VALID_DATA_PATH} \
--epoch 2 \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay ${WEIGHT_DECAY} \
--grad-clip-norm ${GRAD_CLIP} \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--save-interval 50000 \
--eval-interval 500 \
--eval-iters 10 \
--bf16 \
--mixed-precision \
--base-model ${CHECKPOINT_DIR} \
--save ${CHECKPOINT_SAVE_DIR} \
--load ${CHECKPOINT_SAVE_DIR} \
--low-cpu-fsdp \
--sharding-strategy FULL_SHARD \
--checkpoint-type LOCAL_STATE_DICT \
--fsdp-activation-checkpointing \
--instruction-tuning \
--save-sampler-state \
--use-mpi \
--wandb-entity "prj-jalm" \
--wandb-project "Llama-3.1-8B-Instruct" \
--wandb-name "${JOB_NAME}"
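A submission sketch for this script, assuming the standard ABCI qsub flow (the group handle is inferred from the storage paths above and may differ per user):

qsub -g gaf51275 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh

The -o directory in the job header is not created by the scheduler, so run mkdir -p outputs/instruction/Llama-3.1-8B beforehand if it does not exist.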
122 changes: 122 additions & 0 deletions scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh
@@ -0,0 +1,122 @@
#!/bin/bash
#$ -l rt_AF=8
#$ -l h_rt=1:00:00:00
#$ -j y
#$ -o outputs/instruction/Llama-3.1-8B/
#$ -cwd

# module load
source /etc/profile.d/modules.sh
module use /bb/llm/gaf51275/modules/modulefiles

module load cuda/12.1/12.1.1
module load cudnn/cuda-12.1/9.0.0
module load nccl/2.20.5
module load hpcx/2.12
module load gcc/11.4.0

# switch virtual env
source .env/bin/activate

# distributed settings
export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile

if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then
export NUM_GPU_PER_NODE=4
NODE_TYPE="v100"
elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then
export NUM_GPU_PER_NODE=8
NODE_TYPE="a100"
else
echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE"
exit 1
fi

NUM_NODES=$NHOSTS
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

mkdir -p ./hostfile

HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
while read -r line; do
echo "${line} slots=${NUM_GPU_PER_NODE}"
done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"

# training config
SEQ_LENGTH=8192
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256

# optimizer config
LR=2e-6
MIN_LR=2e-7
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# checkpoint
TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct
CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500
CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"

mkdir -p ${CHECKPOINT_SAVE_DIR}

# dataset
DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1

TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl
VALID_DATA_PATH=${DATASET_DIR}/train.jsonl

# job name
JOB_NAME="Llama-3.1-8B-instruct-exp-1.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
--npernode $NUM_GPU_PER_NODE \
-hostfile $HOSTFILE_NAME \
-x MASTER_ADDR=$MASTER_ADDR \
-x MASTER_PORT=$MASTER_PORT \
-bind-to none \
-x NCCL_IB_TIMEOUT=22 \
-x LD_LIBRARY_PATH \
-x PATH \
python examples/finetuning.py \
--seq-length ${SEQ_LENGTH} \
--micro-batch-size ${MICRO_BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--hf-transformer-model-dir ${TOKENIZER_DIR} \
--instruction-train-data-path ${TRAIN_DATA_PATH} \
--instruction-valid-data-path ${VALID_DATA_PATH} \
--epoch 2 \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay ${WEIGHT_DECAY} \
--grad-clip-norm ${GRAD_CLIP} \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--save-interval 50000 \
--eval-interval 500 \
--eval-iters 10 \
--bf16 \
--mixed-precision \
--base-model ${CHECKPOINT_DIR} \
--save ${CHECKPOINT_SAVE_DIR} \
--load ${CHECKPOINT_SAVE_DIR} \
--low-cpu-fsdp \
--sharding-strategy FULL_SHARD \
--checkpoint-type LOCAL_STATE_DICT \
--fsdp-activation-checkpointing \
--instruction-tuning \
--save-sampler-state \
--use-mpi \
--wandb-entity "prj-jalm" \
--wandb-project "Llama-3.1-8B-Instruct" \
--wandb-name "${JOB_NAME}"
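Aside from the experiment name and checkpoint directory, exp1-2 differs from exp1-1 only in its learning-rate pair (LR=2e-6, MIN_LR=2e-7 versus 1e-5/1e-6); both runs train on the same exp1-1 dataset and reuse train.jsonl for validation.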