diff --git a/.vscode/settings.json b/.vscode/settings.json index bbadf97..19bc179 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -47,6 +47,7 @@ "pbar", "peft", "plamo", + "pretraining", "probs", "psutil", "pubmed", @@ -57,6 +58,7 @@ "stabilityai", "stablelm", "stockmark", + "tensorboard", "tflops", "tobytes", "Xformer" diff --git a/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh index 4ebf185..7dca54d 100644 --- a/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh +++ b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh @@ -1,6 +1,6 @@ #!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:01:00:00 +#$ -l rt_AF=1 +#$ -l h_rt=0:08:00:00 #$ -j y #$ -o outputs/instruction/Llama-3-8B/ #$ -cwd @@ -82,7 +82,7 @@ mpirun -np $NUM_GPUS \ -x MASTER_ADDR=$MASTER_ADDR \ -x MASTER_PORT=$MASTER_PORT \ -bind-to none \ - -x PATH \ + -x NCCL_IB_TIMEOUT=22 \ -x LD_LIBRARY_PATH \ -x PATH \ python examples/finetuning.py \ @@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --save-interval 500 \ + --save-interval 10 \ --eval-interval 500 \ --eval-iters 10 \ --bf16 \ diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh new file mode 100644 index 0000000..1d6424d --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 50000 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh new file mode 100644 index 0000000..4b7df07 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2e-6 +MIN_LR=2e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 50000 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh new file mode 100644 index 0000000..972aec0 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh new file mode 100644 index 0000000..6e47a3b --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh new file mode 100644 index 0000000..d4f6e7d --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh new file mode 100644 index 0000000..2109b46 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10+/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10+-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-1.sh new file mode 100644 index 0000000..cde3bf8 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-1.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh new file mode 100644 index 0000000..b2fb469 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh new file mode 100644 index 0000000..f7872b6 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:16:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh new file mode 100644 index 0000000..63810b5 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=16 +#$ -l h_rt=1:8:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys-magpie-ultra + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh new file mode 100644 index 0000000..1f30b07 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=16 +#$ -l h_rt=1:16:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-5/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys-magpie-ultra-gemma-magpie + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-5-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh new file mode 100644 index 0000000..9fd9417 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=3.5e-5 +MIN_LR=3.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-13/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-13-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh new file mode 100644 index 0000000..1549fb1 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-5 +MIN_LR=5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-14/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-14-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh new file mode 100644 index 0000000..16eb573 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh new file mode 100644 index 0000000..3306610 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh new file mode 100644 index 0000000..d473880 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-6 +MIN_LR=1e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh new file mode 100644 index 0000000..6ed7988 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-5/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-5-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh new file mode 100644 index 0000000..21b4b8e --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=16 +#$ -l h_rt=0:20:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-6/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0.5-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-6-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh new file mode 100644 index 0000000..dedc27f --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-7/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-7-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh new file mode 100644 index 0000000..d5b8b7f --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-8/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-no-oasst + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-8-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh new file mode 100644 index 0000000..103eaf1 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-9/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-en-oasst + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-9-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/index.sh b/scripts/index.sh deleted file mode 100644 index 9871629..0000000 --- a/scripts/index.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -#$ -l rt_F=1 -#$ -l h_rt=1:0:00:00 -#$ -j y -#$ -o outputs/index/ -#$ -cwd - -# swich virtual env -source .env/bin/activate - -python src/llama_recipes/datasets/index.py diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 80d872b..e3c38fc 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -10,6 +10,7 @@ def parse_args() -> argparse.Namespace: parser = _add_training_args(parser=parser) parser = _add_regularization_args(parser=parser) parser = _add_instruction_tuning_args(parser=parser) + parser = _add_torch_profiler_args(parser=parser) args = parser.parse_args() @@ -67,6 +68,10 @@ def _add_fsdp_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: group.add_argument( "--use-dist-ckpt", action="store_true" ) + group.add_argument( + '--distributed-timeout-minutes', type=int, default=10, + help='Timeout minutes for torch.distributed.' + ) return parser @@ -204,7 +209,7 @@ def _add_training_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPars # optimizer group.add_argument( '--optimizer', type=str, default='adam', - choices=['adam', 'anyprecision'], + choices=['adam'], help='Optimizer function' ) group.add_argument( @@ -334,5 +339,44 @@ def _add_instruction_tuning_args(parser: argparse.ArgumentParser) -> argparse.Ar group.add_argument( "--save-sampler-state", action="store_true", ) + group.add_argument("--instruct-debug", action="store_true") + + return parser + + +def _add_torch_profiler_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + group = parser.add_argument_group(title='torch profiler') + + group.add_argument('--torch-profile', action='store_true', help='Enable torch profiler') + group.add_argument( + '--torch-profile-ranks', nargs='+', type=int, default=[0], help='Global ranks to profile' + ) + group.add_argument('--torch-profile-wait', type=int, default=0, help='Steps to wait before profiling') + group.add_argument('--torch-profile-warmup', type=int, default=1, help='Warmup steps before profiling') + group.add_argument('--torch-profile-active', type=int, default=1, help='Steps to profile') + group.add_argument( + '--torch-profile-repeat', type=int, default=1, help='Repeat profiling this number of times' + ) + group.add_argument( + '--torch-profile-skip-first', type=int, default=1, + help='Number of iterations to skip before profiling' + ) + group.add_argument( + '--torch-profile-record-shapes', action='store_true', + help='Save information about operator’s input shapes' + ) + group.add_argument( + '--torch-profile-profile-memory', action='store_true', + help='Track tensor memory allocation/deallocation' + ) + group.add_argument( + '--torch-profile-with-stack', action='store_true', + help='Record source information for the ops' + ) + group.add_argument( + '--torch-profile-with-flops', action='store_true', help='Use formula to estimate the FLOPs' + ) + group.add_argument('--torch-profile-with-modules', action='store_true', help='Record module hierarchy ') + group.add_argument('--tensorboard-dir', type=str, default=None) return parser diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 786b946..15ee388 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -1,6 +1,7 @@ import copy import os import sys +from datetime import timedelta import torch import torch.distributed as torch_distributed @@ -69,7 +70,10 @@ def main() -> None: args.gradient_accumulation_steps = args.global_batch_size // (args.micro_batch_size * world_size) assert args.gradient_accumulation_steps >= 1 - torch_distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank) + timeout = timedelta(minutes=args.distributed_timeout_minutes) + torch_distributed.init_process_group( + backend="nccl", world_size=world_size, rank=rank, timeout=timeout, + ) # wandb setting if args.wandb_name is not None and is_rank_0(): @@ -141,6 +145,9 @@ def main() -> None: model_name=args.base_model, ) + from torch.distributed._tensor.device_mesh import init_device_mesh # type: ignore + device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(world_size, )) + model = FSDP( model, # type: ignore auto_wrap_policy=wrapping_policy, @@ -155,8 +162,12 @@ def main() -> None: ) if args.low_cpu_fsdp and rank != 0 else None, + device_mesh=device_mesh, ) if args.fsdp_activation_checkpointing: + # ref: https://github.com/meta-llama/llama-recipes/blob/778e31e35cfbe385a31b3a94b794e3f75e276d1a/src/llama_recipes/finetuning.py#L193-L195 + # model.enable_input_require_grads() + # model.gradient_checkpointing_enable() apply_fsdp_checkpointing(model=model, model_name=args.base_model) if args.direct_preference_optimization: @@ -255,25 +266,13 @@ def main() -> None: else: raise ValueError("unknown training mode") - if args.bf16 and args.optimizer == "anyprecision": - optimizer = AnyPrecisionAdamW( - model.parameters(), # type: ignore - lr=args.lr, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps, - momentum_dtype=torch.bfloat16, - variance_dtype=torch.bfloat16, - use_kahan_summation=False, - weight_decay=args.weight_decay, - ) - else: - optimizer = optim.AdamW( - model.parameters(), # type: ignore - lr=args.lr, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps, - weight_decay=args.weight_decay, - ) + optimizer = optim.AdamW( + model.parameters(), # type: ignore + lr=args.lr, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + weight_decay=args.weight_decay, + ) if args.load: if args.use_dist_ckpt: diff --git a/src/llama_recipes/get_fsdp.py b/src/llama_recipes/get_fsdp.py index 615dda2..4b7d1ca 100644 --- a/src/llama_recipes/get_fsdp.py +++ b/src/llama_recipes/get_fsdp.py @@ -12,6 +12,9 @@ def get_sharding_strategy() -> ShardingStrategy: elif args.sharding_strategy == "NO_SHARD": return ShardingStrategy.NO_SHARD elif args.sharding_strategy == "HYBRID_SHARD": + # TODO: https://pytorch.org/tutorials/recipes/distributed_device_mesh.html#how-to-use-devicemesh-with-hsdp + # support device mesh + # ref: https://github.com/meta-llama/llama-recipes/blob/778e31e35cfbe385a31b3a94b794e3f75e276d1a/src/llama_recipes/finetuning.py#L160 return ShardingStrategy.HYBRID_SHARD elif args.sharding_strategy == "_HYBRID_SHARD_ZERO2": return ShardingStrategy._HYBRID_SHARD_ZERO2 diff --git a/src/llama_recipes/get_models.py b/src/llama_recipes/get_models.py index 05a9851..d1bdf78 100644 --- a/src/llama_recipes/get_models.py +++ b/src/llama_recipes/get_models.py @@ -34,33 +34,15 @@ def get_model( init_time = time.perf_counter() if "Llama" in model_name or "Swallow" in model_name: - if args.low_cpu_fsdp: - """ - for FSDP, we can save cpu memory by loading pretrained model on rank0 only. - this avoids cpu oom when loading large models like llama 70B, in which case - model alone would consume 2+TB cpu mem (70 * 4 * 8). This will add some communications - overhead. - """ - if is_rank_0(): - model = LlamaForCausalLM.from_pretrained( - model_name, - load_in_8bit=True if args.quantization else None, - device_map="auto" if args.quantization else None, - use_cache=use_cache, - ) - else: - llama_config = LlamaConfig.from_pretrained(model_name) - llama_config.use_cache = use_cache - with torch.device("meta"): - model = LlamaForCausalLM(llama_config) - - else: - model = LlamaForCausalLM.from_pretrained( - model_name, - load_in_8bit=True if args.quantization else None, - device_map="auto" if args.quantization else None, - use_cache=use_cache, - ) + model = LlamaForCausalLM.from_pretrained( + model_name, + load_in_8bit=True if args.quantization else None, + device_map="auto" if args.quantization else None, + use_cache=use_cache, + max_position_embeddings=args.seq_length, + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, + ) elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name: # If using torch.device("meta"), FSDP training hang diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 1cb73aa..446e384 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -22,8 +22,9 @@ def __init__( args = get_args() self.data_path: str = data_path - self.max_words: int = args.seq_length + self.max_tokens: int = args.seq_length self.tokenizer = tokenizer + self.debug_mode = args.instruct_debug # system prompt self.system_prompt_role = args.system_prompt_role @@ -64,40 +65,51 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: print(f"index={index}, offset={offset}, line={line}, error={e}") exit(1) - SYSTEM_PROMPT: list[dict[str, str]] = [ - { - "role": self.system_prompt_role, - "content": self.system_prompt_content, - } - ] - # chat template - prompt = self.tokenizer.apply_chat_template( - conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore - add_generation_prompt=True, - tokenize=True, - ) - - example = self.tokenizer.apply_chat_template( - conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore - {"role": "assistant", "content": conversations["output"]} - ], - tokenize=True, - ) - tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) - - if len(example) > self.max_words: + eod_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] + + if "role" in conversations and conversations["role"] == "next_token_prediction": + prompt = [self.tokenizer.bos_token_id] + example = self.tokenizer.encode( + conversations["content"], add_special_tokens=True # type: ignore # text + + ) + example += [eod_token_id] + tensor_example = torch.tensor(example, dtype=torch.int64) + else: + SYSTEM_PROMPT: list[dict[str, str]] = [ + { + "role": self.system_prompt_role, + "content": self.system_prompt_content, + } + ] + # chat template + prompt = self.tokenizer.apply_chat_template( + conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore + tokenize=True, + ) + + example = self.tokenizer.apply_chat_template( + conversation=SYSTEM_PROMPT + conversations["input"] + [conversations["output"]], # type: ignore + tokenize=True, + ) + tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) + + if self.debug_mode: + print( + f"prompt: {self.tokenizer.decode(prompt, skip_special_tokens=False)}\n\nexample: {self.tokenizer.decode(example, skip_special_tokens=False)}\n\n", + flush=True, + ) + + if len(example) > self.max_tokens: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") - padding_length: int = self.max_words - len(example) - eos_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] - pad_token_id = eos_token_id + padding_length: int = self.max_tokens - len(example) + pad_token_id: int = self.tokenizer.pad_token_id # type: ignore + assert pad_token_id is not None if padding_length > 0: - pad_tensor = torch.full( - (padding_length,), pad_token_id, dtype=torch.int64 - ) + pad_tensor = torch.full((padding_length,), pad_token_id, dtype=torch.int64) tensor_example = torch.cat((tensor_example, pad_tensor)) elif padding_length < 0: - tensor_example = tensor_example[: self.max_words] + tensor_example = tensor_example[: self.max_tokens] labels = copy.deepcopy(tensor_example) # promptの長さ分だけ -1 で埋める -> 損失関数で無視するようになる @@ -114,6 +126,36 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: # mask out pad token attention_mask = (tensor_example != pad_token_id).float() + # assert + if self.debug_mode: + # padding + pad_ignore_count = torch.sum((tensor_example == pad_token_id) & (labels == IGNORE_INDEX)).item() + assert ( + pad_ignore_count == padding_length + ), f"Number of IGNORE_INDEX due to padding ({pad_ignore_count}) does not match padding_length ({padding_length})" + + # prompt + non_pad_ignore_count = torch.sum( + (tensor_example != pad_token_id) & (labels == IGNORE_INDEX)).item() + assert non_pad_ignore_count == len( + prompt + ), f"Number of IGNORE_INDEX not due to padding ({non_pad_ignore_count}) does not match prompt length ({len(prompt)})" + + # labels' non ignore index + if "output" in conversations: + non_ignore_labels = labels[labels != IGNORE_INDEX] + + chat_template = [conversations["output"]] + expected_tokens = self.tokenizer.apply_chat_template( + chat_template, return_tensors="pt", tokenize=True # type: ignore + ).squeeze() # type: ignore + if expected_tokens[0] == self.tokenizer.bos_token_id: + expected_tokens = expected_tokens[1:] + + assert torch.all( + non_ignore_labels == expected_tokens + ), "Non-ignored labels do not match the tokenized last assistant message" + return { "input_ids": tensor_example, "labels": labels, diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index cff3b2d..b6c13c7 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -1,5 +1,6 @@ import os import time +import sys import torch import torch.cuda.nccl as nccl @@ -82,10 +83,38 @@ def train( # skip batch if args.instruction_tuning or args.direct_preference_optimization: assert args.continual_pretraining is False - print_rank_0(f"Skipping {iteration} batches") - for _ in range(iteration): + print_rank_0(f"Skipping {iteration} iterations") + for _ in range(iteration * gradient_accumulation_steps): next(train_dataloader) + # profile + torch_profile_on = args.torch_profile and ( + torch_distributed.get_rank() in args.torch_profile_ranks + ) + if torch_profile_on: + profiler_context = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + schedule=torch.profiler.schedule( + wait=args.torch_profile_wait, + warmup=args.torch_profile_warmup, + active=args.torch_profile_active, + repeat=args.torch_profile_repeat, + skip_first=args.torch_profile_skip_first, + ), + on_trace_ready=torch.profiler.tensorboard_trace_handler( + args.tensorboard_dir, use_gzip=False + ), + record_shapes=args.torch_profile_record_shapes, + profile_memory=args.torch_profile_profile_memory, + with_stack=args.torch_profile_with_stack, + with_flops=args.torch_profile_with_flops, + with_modules=args.torch_profile_with_modules, + ) + prof = profiler_context.__enter__() + while iteration < args.train_iters: iteration_start_time = time.perf_counter() @@ -242,6 +271,12 @@ def train( iteration=iteration, ) + # pytorch profiler + if torch_profile_on: + prof.step() + + if torch_profile_on: + profiler_context.__exit__(*sys.exc_info()) torch_distributed.barrier() save_checkpoint( model=model, # type: ignore diff --git a/tools/checkpoint-convert/convert_ckpt.py b/tools/checkpoint-convert/convert_ckpt.py index e8dcb32..be48a4c 100644 --- a/tools/checkpoint-convert/convert_ckpt.py +++ b/tools/checkpoint-convert/convert_ckpt.py @@ -1,35 +1,42 @@ import argparse import torch -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( - "--model", type=str, required=True, help="HuggingFace transformers model name" + "--hf-base-model-checkpoint-path", type=str, + required=True, help="HuggingFace transformers model name" + ) + parser.add_argument("--hf-tokenizer-path", type=str, required=True) + parser.add_argument( + "--pytorch-model-checkpoint-path", type=str, + required=True, help="Path to checkpoint (`model.pth`)" ) - parser.add_argument("--ckpt", type=str, required=True, help="Path to checkpoint (`model.pth`)") parser.add_argument("--out", type=str, required=True, help="Path to output directory") parser.add_argument("--sequence-length", type=int, required=True) args = parser.parse_args() - print(f"Loading HF model: {args.model}", flush=True) + print(f"Loading HF model: {args.hf_base_model_checkpoint_path}", flush=True) model = AutoModelForCausalLM.from_pretrained( - args.model, + args.hf_base_model_checkpoint_path, torch_dtype=torch.bfloat16, trust_remote_code=True, max_position_embeddings=args.sequence_length, ) + tokenizer = AutoTokenizer.from_pretrained(args.hf_tokenizer_path) - print(f"Loading CKPT: {args.ckpt}", flush=True) - state_dict = torch.load(args.ckpt, map_location="cpu") + print(f"Loading CKPT: {args.pytorch_model_checkpoint_path}", flush=True) + state_dict = torch.load(args.pytorch_model_checkpoint_path, map_location="cpu") print("Loading state dict into HF model", flush=True) model.load_state_dict(state_dict) print("Saving HF model", flush=True) model.save_pretrained(args.out, safe_serialization=True) + tokenizer.save_pretrained(args.out) if __name__ == "__main__": diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index e54860b..586dc12 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_F=1 +#$ -l rt_AF=1 #$ -l h_rt=1:00:00 #$ -j y #$ -o outputs/convert/ckpt/ @@ -16,36 +16,72 @@ module load hpcx/2.12 module load gcc/11.4.0 set -e +export HF_HOME="/groups/gag51395/.cache/huggigface" # swich virtual env source .env/bin/activate -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-1/LR_2.5e-5_MINLR_2.5e-6_WD_0.1_GC_1 +LATEST_ITERATION=$(cat ${CHECKPOINT_DIR}/latest_iteration.txt) -echo "MASTER_ADDR=${MASTER_ADDR}" +echo "LATEST_ITERATION=${LATEST_ITERATION}" -start=578 -end=578 -increment=5000 +BASE_MODEL_CHECKPOINT=/bb/llm/gaf51275/hf-checkpoints/Meta-Llama-3.1-8B-Instruct +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +OUTPUT_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/ +EXTRACTED_PATH=$(echo $CHECKPOINT_DIR | awk -F'/Llama-3.1-8B-Instruct/' '{print $2}') +OUTPUT_DIR="${OUTPUT_DIR}${EXTRACTED_PATH}" -for ((i = start; i <= end; i += increment)); do - ITERATION=$i - FORMATTED_ITERATION=$(printf "iter_%07d" $ITERATION) +echo "convert ${CHECKPOINT_DIR} to ${OUTPUT_DIR}" +mkdir -p $OUTPUT_DIR - CHECK_POINT_PATH=/bb/llm/gaf51275/llama/checkpoints/Swallow-70b-VE-chat/oasst2-top1-imitation-2-3-lr_1e-5-minlr_1e-6-GB_256/${FORMATTED_ITERATION}/model.pt - OUTPUT_PATH=/bb/llm/gaf51275/llama/converted-hf-checkpoint/Swallow-70b-VE-chat/oasst2-top1-imitation-2-3-lr_1e-5-minlr_1e-6-GB_256/${FORMATTED_ITERATION} +ITERATION=$LATEST_ITERATION +FORMATTED_ITERATION=$(printf "iter_%07d" $ITERATION) - echo "convert ${CHECK_POINT_PATH} to ${OUTPUT_PATH}" +CHECK_POINT_PATH=${CHECKPOINT_DIR}/${FORMATTED_ITERATION}/model.pt +OUTPUT_PATH=${OUTPUT_DIR}/${FORMATTED_ITERATION} - mkdir -p $OUTPUT_PATH +echo "convert ${CHECK_POINT_PATH} to ${OUTPUT_PATH}" - BASE_MODEL_CHECKPOINT=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf +mkdir -p $OUTPUT_PATH - python tools/checkpoint-convert/convert_ckpt.py \ - --model $BASE_MODEL_CHECKPOINT \ - --ckpt $CHECK_POINT_PATH \ - --out $OUTPUT_PATH \ - --sequence-length 4096 -done +# convert +python tools/checkpoint-convert/convert_ckpt.py \ + --hf-base-model-checkpoint-path $BASE_MODEL_CHECKPOINT \ + --hf-tokenizer-path $TOKENIZER_DIR \ + --pytorch-model-checkpoint-path $CHECK_POINT_PATH \ + --out $OUTPUT_PATH \ + --sequence-length 8192 + +# upload +upload_checkpoint() { + local upload_dir=$1 + local repo_name=$2 + local max_retries=5 + local retry_count=0 + + while [ $retry_count -lt $max_retries ]; do + if python scripts/abci/upload/upload.py \ + --ckpt-path "$upload_dir" \ + --repo-name "$repo_name"; then + echo "Successfully uploaded $repo_name" + return 0 + else + echo "Upload failed for $repo_name. Retrying..." + ((retry_count++)) + sleep 5 + fi + done + + echo "Failed to upload $repo_name after $max_retries attempts" + return 1 +} + +EXP_NAME=$(echo $EXTRACTED_PATH | sed 's/\//-/g') +HF_REPO_NAME="tokyotech-llm/Llama-3.1-8B-Instruct-${EXP_NAME}-${FORMATTED_ITERATION}" + +echo "upload ${OUTPUT_PATH} to ${HF_REPO_NAME}" + +if ! upload_checkpoint "$OUTPUT_PATH" "$HF_REPO_NAME"; then + echo "Skipping to next checkpoint after repeated failures for $HF_REPO_NAME" +fi diff --git a/tools/dataset/converter/convert_conversation.py b/tools/dataset/converter/convert_conversation.py new file mode 100644 index 0000000..5ca72f8 --- /dev/null +++ b/tools/dataset/converter/convert_conversation.py @@ -0,0 +1,40 @@ +import argparse +import json +import sys + + +def process_jsonl(input_file, output_file): + with open(input_file, "r") as infile, open(output_file, "w") as outfile: + for line in infile: + data = json.loads(line) + + conversations = data.get("conversations", []) + assert len(conversations) >= 2 + + input_data = conversations[:-1] + output_data = conversations[-1] + + data["input"] = input_data + data["output"] = output_data + + json.dump(data, outfile) + outfile.write("\n") + + +def main(): + parser = argparse.ArgumentParser(description="Process JSONL data") + parser.add_argument("--input", help="Input JSONL file") + parser.add_argument("--output", help="Output JSONL file") + + args = parser.parse_args() + + try: + process_jsonl(args.input, args.output) + print(f"Processing complete. Output written to {args.output}") + except Exception as e: + print(f"An error occurred: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/convert_dataset_dpo.py b/tools/dataset/converter/convert_dataset_dpo.py similarity index 100% rename from tools/dataset/convert_dataset_dpo.py rename to tools/dataset/converter/convert_dataset_dpo.py diff --git a/tools/dataset/convert_dataset_instruct.py b/tools/dataset/converter/convert_dataset_instruct.py similarity index 100% rename from tools/dataset/convert_dataset_instruct.py rename to tools/dataset/converter/convert_dataset_instruct.py diff --git a/tools/dataset/converter/convert_magpie_ultra.py b/tools/dataset/converter/convert_magpie_ultra.py new file mode 100644 index 0000000..8e64b48 --- /dev/null +++ b/tools/dataset/converter/convert_magpie_ultra.py @@ -0,0 +1,85 @@ +import argparse +import json +import sys + + +def process_input(input_file): + data = [] + with open(input_file, "r", encoding="utf-8") as f: + for line in f: + try: + item = json.loads(line.strip()) + data.append(item) + except json.JSONDecodeError as e: + print(f"Warning: Skipping invalid JSON line: {line.strip()}", file=sys.stderr) + return data + + +def convert_to_output(input_data, include_english: bool = False): + output_data = [] + for item in input_data: + if item.get("quality") in ["average", "good", "excellent"]: + output_item = { + "input": [ + { + "role": "user", + "content": item["processed_translated_instruction"] + } + ], + "output": { + "role": "assistant", + "content": item["processed_translated_response"] + }, + "quality": item["quality"], + "primary_tag": item["primary_tag"], + } + output_data.append(output_item) + if include_english: + en_output_item = { + "input": [ + { + "role": "user", + "content": item["instruction"], + } + ], + "output": { + "role": "assistant", + "content": item["response"] + }, + "quality": item["quality"], + "primary_tag": item["primary_tag"], + } + output_data.append(en_output_item) + + return output_data + + +def save_output(output_data, output_file): + with open(output_file, "w", encoding="utf-8") as f: + for item in output_data: + json.dump(item, f, ensure_ascii=False) + f.write("\n") + + +def main(): + parser = argparse.ArgumentParser(description="Convert input JSONL to output JSONL") + parser.add_argument("--input", required=True, help="Input JSONL file path") + parser.add_argument("--output", required=True, help="Output JSONL file path") + parser.add_argument("--include-english", action="store_true") + + args = parser.parse_args() + + try: + input_data = process_input(args.input) + output_data = convert_to_output( + input_data=input_data, include_english=args.include_english + ) + save_output(output_data, args.output) + print(f"Conversion completed. Output saved to {args.output}") + except Exception as e: + print(f"An error occurred: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/debug_chat_template.py b/tools/dataset/debug/debug_chat_template.py similarity index 98% rename from tools/dataset/debug_chat_template.py rename to tools/dataset/debug/debug_chat_template.py index 025b71e..20a79d1 100644 --- a/tools/dataset/debug_chat_template.py +++ b/tools/dataset/debug/debug_chat_template.py @@ -45,7 +45,7 @@ } ] -chat_template: str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +chat_template: str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" # noqa: print("before apply chat template") diff --git a/tools/dataset/debug_instruction.py b/tools/dataset/debug/debug_instruction.py similarity index 100% rename from tools/dataset/debug_instruction.py rename to tools/dataset/debug/debug_instruction.py diff --git a/tools/dataset/extract_jsonl.py b/tools/dataset/extract_jsonl.py new file mode 100644 index 0000000..164be17 --- /dev/null +++ b/tools/dataset/extract_jsonl.py @@ -0,0 +1,65 @@ +import argparse +import json +import random +from pathlib import Path + + +def count_lines(file_path): + with open(file_path, "r", encoding="utf-8") as f: + return sum(1 for _ in f) + + +def extract_random_lines(input_path, output_path, num_lines): + total_lines = count_lines(input_path) + + if num_lines >= total_lines: + print( + f"Warning: Requested {num_lines} lines, but file only contains {total_lines} lines. Extracting all lines." + ) + num_lines = total_lines + + selected_indices = set(random.sample(range(total_lines), num_lines)) + + with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile: + for i, line in enumerate(infile): + if i in selected_indices: + try: + # Verify that the line is valid JSON + json.loads(line.strip()) + outfile.write(line) + except json.JSONDecodeError: + print(f"Warning: Invalid JSON on line {i+1}. Skipping.") + selected_indices.remove(i) + if not selected_indices: + break + + +def main(): + parser = argparse.ArgumentParser(description="Extract specified number of random lines from a JSONL file.") + parser.add_argument("--input-path", required=True, help="Path to the input JSONL file") + parser.add_argument("--output-path", required=True, help="Path to the output JSONL file") + parser.add_argument("--num-lines", type=int, required=True, help="Number of lines to extract") + parser.add_argument("--seed", type=int, help="Random seed for reproducibility") + + args = parser.parse_args() + + input_path = Path(args.input_path) + output_path = Path(args.output_path) + + if not input_path.exists(): + print(f"Error: Input file '{input_path}' does not exist.") + return + + if not input_path.is_file(): + print(f"Error: '{input_path}' is not a file.") + return + + if args.seed is not None: + random.seed(args.seed) + + extract_random_lines(input_path, output_path, args.num_lines) + print(f"Extracted {args.num_lines} random lines from '{input_path}' to '{output_path}'.") + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/fileter.py b/tools/dataset/fileter.py new file mode 100644 index 0000000..428722b --- /dev/null +++ b/tools/dataset/fileter.py @@ -0,0 +1,46 @@ +import argparse +import json +from typing import List, Dict + + +def process_jsonl(file_path: str, threshold: float) -> List[Dict]: + filtered_data = [] + with open(file_path, "r") as file: + for line in file: + entry = json.loads(line) + if "overall" not in entry["scores"]: + continue + + if entry["scores"]["overall"] >= threshold: + conversations = entry["conversations"] + # Get all messages except the last assistant message + input_messages = conversations[:-1] + assert len(conversations) % 2 == 0 + # Get only the last assistant message + output_message = conversations[-1] + assert output_message["role"] == "assistant" + assert type(output_message) is dict + filtered_data.append({"input": input_messages, "output": output_message}) + return filtered_data + + +def main(): + parser = argparse.ArgumentParser(description="Filter JSONL file based on score threshold") + parser.add_argument("--input_file", type=str, help="Path to input JSONL file") + parser.add_argument("--output_file", type=str, help="Path to output JSONL file") + parser.add_argument("--threshold", type=int, default=4, help="Score threshold for filtering (default: 0.0)") + + args = parser.parse_args() + + filtered_data = process_jsonl(args.input_file, args.threshold) + + with open(args.output_file, "w", encoding="utf-8") as outfile: + for entry in filtered_data: + json.dump(entry, outfile, ensure_ascii=False) + outfile.write("\n") + + print(f"Processed data has been written to {args.output_file}") + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/filter/filter_gemma_magpie.py b/tools/dataset/filter/filter_gemma_magpie.py new file mode 100644 index 0000000..9a48d99 --- /dev/null +++ b/tools/dataset/filter/filter_gemma_magpie.py @@ -0,0 +1,109 @@ +import argparse +import json +import sys +import random +import re + + +def is_empty_or_template(content): + content = content.strip() + return content in ("", "\n", "\n\n") or content in ("回答例:", "回答例;", "解答例:", "解答例;") + + +def clean_content_start(content): + # Remove leading ">\n\n" or ">\n\n\n" + content = re.sub(r"^>\n\n+", "", content) + # Remove leading asterisks + content = re.sub(r"^\s*\*+\s*", "", content) + return content + + +def clean_content_end(content): + # Remove leading newlines and spaces + content = content.lstrip("\n ") + + # Process the end of the content + lines = content.splitlines() + if lines: + # Clean the last line + last_line = lines[-1].rstrip() + # Remove trailing "**" if present + last_line = re.sub(r"\*+\s*$", "", last_line) + lines[-1] = last_line + + # Join the lines back together + content = "\n".join(lines) + + # Remove trailing asterisks followed by newline + content = re.sub(r"\*+\s*\n$", "\n", content) + + # Ensure the content ends with exactly one newline + content = content.rstrip() + "\n" + + return content + + +def process_jsonl(input_file, output_file): + processed_data = [] + seen_contents = set() + with open(input_file, "r") as infile: + for line in infile: + try: + data = json.loads(line) + + # Transform input + if "input" in data: + data["input"] = [data["input"]] + + # Clean input and output content + input_content = clean_content_end(clean_content_start(data["input"][0].get("content", ""))) + output_content = clean_content_end(clean_content_start(data.get("output", {}).get("content", ""))) + + # Check for empty or template content + if is_empty_or_template(input_content) or is_empty_or_template(output_content): + continue + + # Check for duplicates + content_pair = (input_content, output_content) + if content_pair in seen_contents: + continue + seen_contents.add(content_pair) + + # Update cleaned contents + data["input"][0]["content"] = input_content + data["output"]["content"] = output_content + + # add text section + data["text"] = "user: " + input_content + "\n" + "assistant: " + output_content + + processed_data.append(data) + + except json.JSONDecodeError: + print(f"Error decoding JSON: {line}", file=sys.stderr) + + # Shuffle the processed data + random.shuffle(processed_data) + + # Write the shuffled data to the output file + with open(output_file, "w") as outfile: + for data in processed_data: + json.dump(data, outfile) + outfile.write("\n") + + +def main(): + parser = argparse.ArgumentParser(description="Process and shuffle JSONL files") + parser.add_argument("--input", help="Input JSONL file") + parser.add_argument("--output", help="Output JSONL file") + parser.add_argument("--seed", type=int, help="Random seed for shuffling", default=123) + + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + + process_jsonl(args.input, args.output) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/lmsys_dataset.py b/tools/dataset/lmsys_dataset.py new file mode 100644 index 0000000..7c7cb27 --- /dev/null +++ b/tools/dataset/lmsys_dataset.py @@ -0,0 +1,91 @@ +import argparse +import json +import hashlib +from typing import Any + + +def process_sample(sample: dict[str, Any]) -> dict[str, Any] | None: + conversation = sample.get("conversation", []) + if len(conversation) < 2: + return None + + user_message = conversation[0] + assistant_message = conversation[1] + + if not user_message.get("content") or not assistant_message.get("content"): + return None + + result = { + "input": [{"role": "user", "content": user_message["content"]}], + "output": {"role": "assistant", "content": assistant_message["content"]}, + "conversation": sample, + "redacted": "NAME_" in user_message["content"] or "NAME_" in assistant_message["content"], + "text": "user: " + user_message["content"] + "\n\nassistant: " + assistant_message["content"] + } + + return result + + +def hash_sample(sample: dict[str, Any]) -> str: + return hashlib.md5(json.dumps(sample, sort_keys=True).encode()).hexdigest() + + +def main(input_file: str, output_file: str, include_redacted: bool): + with open(input_file, "r", encoding="utf-8") as f: + data = [json.loads(line) for line in f] + + processed_samples = [] + hash_set = set() + invalid_count = 0 + redacted_count = 0 + non_redacted_count = 0 + + for sample in data: + processed = process_sample(sample) + if processed: + if processed["redacted"]: + redacted_count += 1 + if include_redacted: + sample_hash = hash_sample(processed) + if sample_hash not in hash_set: + hash_set.add(sample_hash) + processed_samples.append(processed) + else: + print(f"Duplicate redacted sample found: {sample}") + else: + non_redacted_count += 1 + sample_hash = hash_sample(processed) + if sample_hash not in hash_set: + hash_set.add(sample_hash) + processed_samples.append(processed) + else: + print(f"Duplicate non-redacted sample found: {sample}") + else: + print(f"Invalid sample: {sample}") + invalid_count += 1 + + with open(output_file, "w", encoding="utf-8") as f: + for sample in processed_samples: + json.dump(sample, f, ensure_ascii=False) + f.write("\n") + + print(f"Processed {len(processed_samples)} unique samples.") + print(f"Found {invalid_count} invalid samples.") + print(f"Total samples: {len(data)}") + print(f"Unique non-redacted samples: {non_redacted_count}") + print(f"Redacted samples: {redacted_count}") + if include_redacted: + print("Redacted samples included in output") + else: + print("Redacted samples not included in output") + print(f"Output written to {output_file}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert JSON to specified JSONL format") + parser.add_argument("--input-file", required=True, help="Input JSON file path") + parser.add_argument("--output-file", required=True, help="Output JSONL file path") + parser.add_argument("--include-redacted", action="store_true", help="Include redacted samples in output") + args = parser.parse_args() + + main(args.input_file, args.output_file, args.include_redacted) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 837c86c..b02410b 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,17 +1,142 @@ #!/bin/bash -INPUT_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/formatted -OUTPUT_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/imitation_2_oasst2_top1 +set -e + +# Control variables +INCLUDE_REDACTED=false +FILTERD_SCORE=0 +NEXT_TOKEN_PERCENT=0 +USE_OPEN_ASSISTANT=false +USE_ONLY_ENGLISH_OPEN_ASSISTANT=false +USE_ENGLISH_LMSYS=true +USE_MAGPIE_ULTRA=true +USE_GEMMA_MAGPIE=true +CUSTOM_OUTPUT_DIR="" + +# Base output directory +BASE_OUTPUT_DIR="/bb/llm/gaf51275/datasets/raw/instruct/training" + +if [ -z "$CUSTOM_OUTPUT_DIR" ]; then + OUTPUT_DIR="$BASE_OUTPUT_DIR/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT" +else + OUTPUT_DIR="$CUSTOM_OUTPUT_DIR" +fi + +if $INCLUDE_REDACTED; then + OUTPUT_DIR="${OUTPUT_DIR}-redacted" +fi + +if ! $USE_OPEN_ASSISTANT; then + OUTPUT_DIR="${OUTPUT_DIR}-no-oasst" +elif $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then + OUTPUT_DIR="${OUTPUT_DIR}-en-oasst" +fi + +if $USE_ENGLISH_LMSYS; then + OUTPUT_DIR="${OUTPUT_DIR}-en-lmsys" +fi + +if $USE_MAGPIE_ULTRA; then + OUTPUT_DIR="${OUTPUT_DIR}-magpie-ultra" +fi + +if $USE_GEMMA_MAGPIE; then + OUTPUT_DIR="${OUTPUT_DIR}-gemma-magpie" +fi mkdir -p $OUTPUT_DIR -cat $INPUT_DIR/oasst1-21k-ja-mixtral-imitation_2.jsonl $INPUT_DIR/oasst2-top1-en.jsonl > $OUTPUT_DIR/merged.jsonl +if $USE_OPEN_ASSISTANT; then + if $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then + FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" + ) + else + FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" + ) + fi + + MERGED_FILE=$OUTPUT_DIR/merged_oasst.jsonl + + for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE + done + + # filter + python tools/dataset/fileter.py \ + --input_file $MERGED_FILE \ + --output_file $OUTPUT_DIR/train.jsonl \ + --threshold $FILTERD_SCORE + + rm $MERGED_FILE + + echo "Filtered open assistant data:" + wc -l $OUTPUT_DIR/train.jsonl +else + # Open Assistant データを使用しない場合は空のファイルを作成 + touch $OUTPUT_DIR/train.jsonl + echo "Skipped Open Assistant data processing." +fi + +# 日本語のLMSYSデータを常に使用 +if $INCLUDE_REDACTED; then + JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl +else + JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-ja-no-redacted.jsonl +fi + +cat $JA_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl + +# 英語のLMSYSデータを追加でオプションとして使用 +if $USE_ENGLISH_LMSYS; then + if $INCLUDE_REDACTED; then + EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en.jsonl + else + EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en-no-redacted.jsonl + fi + cat $EN_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added English LMSYS data" +fi + +# Add magpie-ultra dataset processing +if $USE_MAGPIE_ULTRA; then + MAGPIE_ULTRA_FILE=/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/train.jsonl + cat $MAGPIE_ULTRA_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added magpie-ultra data" +fi + +# Add gemma-magpie dataset processing +if $USE_GEMMA_MAGPIE; then + GEMMA_MAGPIE_FILE=/bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it/format.jsonl + cat $GEMMA_MAGPIE_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added gemma-magpie data" +fi + +INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') +NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) + +python tools/dataset/extract_jsonl.py \ + --input-path /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format/merged.jsonl \ + --output-path $OUTPUT_DIR/next-token.jsonl \ + --num-lines $NEXT_TOKEN_SAMPLES \ + --seed 1234 + +echo "Next token data:" +wc -l $OUTPUT_DIR/next-token.jsonl + +cat $OUTPUT_DIR/next-token.jsonl >> $OUTPUT_DIR/train.jsonl + +echo "Total data:" +wc -l $OUTPUT_DIR/train.jsonl -echo "Merged dataset is saved at $OUTPUT_DIR/merged.jsonl" +rm $OUTPUT_DIR/next-token.jsonl -# swich virtual env -source .env/bin/activate +# indexing -python tools/dataset/shuffle_and_split.py \ - --input $OUTPUT_DIR/merged.jsonl \ - --output $OUTPUT_DIR +python tools/pre-process/index_dataset.py \ + --data-file-path $OUTPUT_DIR/train.jsonl diff --git a/tools/dataset/merge_gemma_magpie.sh b/tools/dataset/merge_gemma_magpie.sh new file mode 100644 index 0000000..afd6d5f --- /dev/null +++ b/tools/dataset/merge_gemma_magpie.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it + +mkdir -p $OUTPUT_DIR +rm $OUTPUT_DIR/merged.jsonl + +FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it -name "*.jsonl") + +MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE diff --git a/tools/dataset/merge_magpie_ultra.sh b/tools/dataset/merge_magpie_ultra.sh new file mode 100644 index 0000000..7513078 --- /dev/null +++ b/tools/dataset/merge_magpie_ultra.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data + +mkdir -p $OUTPUT_DIR +rm $OUTPUT_DIR/merged.jsonl + +FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_3.jsonl" +) + +MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE diff --git a/tools/dataset/merge_next_token.sh b/tools/dataset/merge_next_token.sh new file mode 100644 index 0000000..72c43e3 --- /dev/null +++ b/tools/dataset/merge_next_token.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format + +mkdir -p $OUTPUT_DIR + +FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format -name "*.jsonl") + +MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE diff --git a/tools/dataset/next_token_prediciton.py b/tools/dataset/next_token_prediciton.py new file mode 100644 index 0000000..903ce63 --- /dev/null +++ b/tools/dataset/next_token_prediciton.py @@ -0,0 +1,24 @@ +import argparse +import json + +def check_jsonl_file(file_path): + with open(file_path, 'r') as file: + for line_number, line in enumerate(file, 1): + try: + json_obj = json.loads(line) + if json_obj.get('role') == 'next_token_prediction': + pass + else: + print(f"Line {line_number}: 'role': 'next_token_prediction' not found") + except json.JSONDecodeError: + print(f"Line {line_number}: Invalid JSON") + +def main(): + parser = argparse.ArgumentParser(description="Check JSONL file for 'role': 'next_token_prediction'") + parser.add_argument('--file_path', help='Path to the JSONL file') + args = parser.parse_args() + + check_jsonl_file(args.file_path) + +if __name__ == '__main__': + main() diff --git a/tools/inference/inference.py b/tools/inference/inference.py index e0a1a58..2991255 100644 --- a/tools/inference/inference.py +++ b/tools/inference/inference.py @@ -9,6 +9,7 @@ parser.add_argument("--model-path", type=str) parser.add_argument("--tokenizer-path", type=str) parser.add_argument("--prompt", type=str, default=None) +parser.add_argument("--chat-template", action="store_true") args = parser.parse_args() @@ -22,11 +23,21 @@ device_map="auto", torch_dtype=torch.bfloat16 ) -input_ids: torch.Tensor = tokenizer.encode( # type: ignore - args.prompt, - add_special_tokens=False, - return_tensors="pt" -) +if args.chat_template: + input_ids = tokenizer.apply_chat_template( # type: ignore + [ + {"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。"}, + {"role": "user", "content": args.prompt}, + ], + tokenize=True, + return_tensors="pt" + ) +else: + input_ids: torch.Tensor = tokenizer.encode( # type: ignore + args.prompt, + add_special_tokens=False, + return_tensors="pt" + ) outputs = model.generate( # type: ignore input_ids.to(device=model.device), # type: ignore max_new_tokens=1024, @@ -35,5 +46,5 @@ do_sample=True, ) -generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) +generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False) print(generated_text) diff --git a/tools/inference/inference_abci.sh b/tools/inference/inference_abci.sh new file mode 100644 index 0000000..3544d80 --- /dev/null +++ b/tools/inference/inference_abci.sh @@ -0,0 +1,53 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=0:01:00:00 +#$ -j y +#$ -o outputs/inference/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +set -e + +# swich virtual env +source .env/bin/activate + +INFERENCE_MODEL_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/exp2-4/LR_1e-6_MINLR_1e-7_WD_0.1_GC_1/iter_0004000 + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "Please explain Credit Default Swaps." \ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "会社法について説明してください。" \ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "東京工業大学のキャンパスはどこにありますか?" \ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "1+4+8の答えはいくつでしょうか?" \ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "Pythonでデータ構造のUnionFindクラスを作成してください。" \ + --chat-template diff --git a/tools/pre-process/scripts/index.sh b/tools/pre-process/scripts/index.sh index 2a46f3f..16a3f84 100644 --- a/tools/pre-process/scripts/index.sh +++ b/tools/pre-process/scripts/index.sh @@ -2,8 +2,17 @@ source .env/bin/activate -INPUT_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 -# baseline python tools/pre-process/index_dataset.py \ - --data-file-path $INPUT_DIR/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl + --data-file-path $INPUT_DIR/train.jsonl + +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 + +python tools/pre-process/index_dataset.py \ + --data-file-path $INPUT_DIR/train.jsonl + +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 + +python tools/pre-process/index_dataset.py \ + --data-file-path $INPUT_DIR/train.jsonl