From 58ce89f144b1b3979a65fc03bbabb9b128d985ba Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 14:48:46 +0900 Subject: [PATCH 01/44] feat: distributed-timeout setting --- src/llama_recipes/arguments.py | 4 ++++ src/llama_recipes/finetuning.py | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 80d872b..3241a61 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -67,6 +67,10 @@ def _add_fsdp_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: group.add_argument( "--use-dist-ckpt", action="store_true" ) + group.add_argument( + '--distributed-timeout-minutes', type=int, default=10, + help='Timeout minutes for torch.distributed.' + ) return parser diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 786b946..3b6094c 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -69,7 +69,10 @@ def main() -> None: args.gradient_accumulation_steps = args.global_batch_size // (args.micro_batch_size * world_size) assert args.gradient_accumulation_steps >= 1 - torch_distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank) + timeout = timedelta(minutes=args.distributed_timeout_minutes) + torch_distributed.init_process_group( + backend="nccl", world_size=world_size, rank=rank, timeout=timeout, + ) # wandb setting if args.wandb_name is not None and is_rank_0(): From 47a786d6c111089a3da8663fd33760161300c3f7 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 15:50:04 +0900 Subject: [PATCH 02/44] feat: make anyprecision optimizer deprecated --- src/llama_recipes/arguments.py | 2 +- src/llama_recipes/finetuning.py | 27 ++++++++------------------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 3241a61..0337321 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -208,7 +208,7 @@ def _add_training_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPars # optimizer group.add_argument( '--optimizer', type=str, default='adam', - choices=['adam', 'anyprecision'], + choices=['adam'], help='Optimizer function' ) group.add_argument( diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 3b6094c..d771c90 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -1,6 +1,7 @@ import copy import os import sys +from datetime import timedelta import torch import torch.distributed as torch_distributed @@ -258,25 +259,13 @@ def main() -> None: else: raise ValueError("unknown training mode") - if args.bf16 and args.optimizer == "anyprecision": - optimizer = AnyPrecisionAdamW( - model.parameters(), # type: ignore - lr=args.lr, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps, - momentum_dtype=torch.bfloat16, - variance_dtype=torch.bfloat16, - use_kahan_summation=False, - weight_decay=args.weight_decay, - ) - else: - optimizer = optim.AdamW( - model.parameters(), # type: ignore - lr=args.lr, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps, - weight_decay=args.weight_decay, - ) + optimizer = optim.AdamW( + model.parameters(), # type: ignore + lr=args.lr, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + weight_decay=args.weight_decay, + ) if args.load: if args.use_dist_ckpt: From 85404b8000cf5d90845b59de28021bddc06a95a2 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 
Sep 2024 15:52:02 +0900 Subject: [PATCH 03/44] fix: loss curve different bug when checkpoint load --- src/llama_recipes/utils/train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index cff3b2d..daf3d24 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -83,7 +83,7 @@ def train( if args.instruction_tuning or args.direct_preference_optimization: assert args.continual_pretraining is False print_rank_0(f"Skipping {iteration} batches") - for _ in range(iteration): + for _ in range(iteration * gradient_accumulation_steps): next(train_dataloader) while iteration < args.train_iters: From 9d7f02326a1a85273e7dcdce8009cd3e604eeedc Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 15:52:29 +0900 Subject: [PATCH 04/44] fix: make meta device load deprecated --- src/llama_recipes/get_models.py | 36 +++++++++------------------------ 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/src/llama_recipes/get_models.py b/src/llama_recipes/get_models.py index 05a9851..d1bdf78 100644 --- a/src/llama_recipes/get_models.py +++ b/src/llama_recipes/get_models.py @@ -34,33 +34,15 @@ def get_model( init_time = time.perf_counter() if "Llama" in model_name or "Swallow" in model_name: - if args.low_cpu_fsdp: - """ - for FSDP, we can save cpu memory by loading pretrained model on rank0 only. - this avoids cpu oom when loading large models like llama 70B, in which case - model alone would consume 2+TB cpu mem (70 * 4 * 8). This will add some communications - overhead. - """ - if is_rank_0(): - model = LlamaForCausalLM.from_pretrained( - model_name, - load_in_8bit=True if args.quantization else None, - device_map="auto" if args.quantization else None, - use_cache=use_cache, - ) - else: - llama_config = LlamaConfig.from_pretrained(model_name) - llama_config.use_cache = use_cache - with torch.device("meta"): - model = LlamaForCausalLM(llama_config) - - else: - model = LlamaForCausalLM.from_pretrained( - model_name, - load_in_8bit=True if args.quantization else None, - device_map="auto" if args.quantization else None, - use_cache=use_cache, - ) + model = LlamaForCausalLM.from_pretrained( + model_name, + load_in_8bit=True if args.quantization else None, + device_map="auto" if args.quantization else None, + use_cache=use_cache, + max_position_embeddings=args.seq_length, + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, + ) elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name: # If using torch.device("meta"), FSDP training hang From 9f2dc2ff776886b396d53957f16d7c68a26c5537 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 15:55:49 +0900 Subject: [PATCH 05/44] chore: update spell checker's ignored words --- .vscode/settings.json | 1 + 1 file changed, 1 insertion(+) diff --git a/.vscode/settings.json b/.vscode/settings.json index bbadf97..c99ea13 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -47,6 +47,7 @@ "pbar", "peft", "plamo", + "pretraining", "probs", "psutil", "pubmed", From 5981e227a582493d2779e1bbdb69cd1e6e6f7454 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 16:17:23 +0900 Subject: [PATCH 06/44] feat: torch profiler --- src/llama_recipes/arguments.py | 33 ++++++++++++++++++++++++ src/llama_recipes/utils/train_utils.py | 35 ++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git 
a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 0337321..2addf03 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -10,6 +10,7 @@ def parse_args() -> argparse.Namespace: parser = _add_training_args(parser=parser) parser = _add_regularization_args(parser=parser) parser = _add_instruction_tuning_args(parser=parser) + parser = _add_torch_profiler_args(parser=parser) args = parser.parse_args() @@ -340,3 +341,35 @@ def _add_instruction_tuning_args(parser: argparse.ArgumentParser) -> argparse.Ar ) return parser + + +def _add_torch_profiler_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + group = parser.add_argument_group(title='torch profiler') + + group.add_argument('--torch-profile', action='store_true', help='Enable torch profiler') + group.add_argument( + '--torch-profile-ranks', nargs='+', type=int, default=[0], help='Global ranks to profile' + ) + group.add_argument('--torch-profile-wait', type=int, default=0, help='Steps to wait before profiling') + group.add_argument('--torch-profile-warmup', type=int, default=1, help='Warmup steps before profiling') + group.add_argument('--torch-profile-active', type=int, default=1, help='Steps to profile') + group.add_argument( + '--torch-profile-repeat', type=int, default=1, help='Repeat profiling this number of times' + ) + group.add_argument( + '--torch-profile-skip-first', type=int, default=1, + help='Number of iterations to skip before profiling' + ) + group.add_argument('--torch-profile-record-shapes', action='store_true', + help='Save information about operator’s input shapes') + group.add_argument('--torch-profile-profile-memory', action='store_true', + help='Track tensor memory allocation/deallocation') + group.add_argument('--torch-profile-with-stack', action='store_true', + help='Record source information for the ops') + group.add_argument( + '--torch-profile-with-flops', action='store_true', help='Use formula to estimate the FLOPs' + ) + group.add_argument('--torch-profile-with-modules', action='store_true', help='Record module hierarchy ') + group.add_argument('--tensorboard-dir', type=str, default=None) + + return parser diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index daf3d24..34c8534 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -1,5 +1,6 @@ import os import time +import sys import torch import torch.cuda.nccl as nccl @@ -86,6 +87,34 @@ def train( for _ in range(iteration * gradient_accumulation_steps): next(train_dataloader) + # profile + torch_profile_on = args.torch_profile and ( + torch_distributed.get_rank() in args.torch_profile_ranks + ) + if torch_profile_on: + profiler_context = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + schedule=torch.profiler.schedule( + wait=args.torch_profile_wait, + warmup=args.torch_profile_warmup, + active=args.torch_profile_active, + repeat=args.torch_profile_repeat, + skip_first=args.torch_profile_skip_first, + ), + on_trace_ready=torch.profiler.tensorboard_trace_handler( + args.tensorboard_dir, use_gzip=False + ), + record_shapes=args.torch_profile_record_shapes, + profile_memory=args.torch_profile_profile_memory, + with_stack=args.torch_profile_with_stack, + with_flops=args.torch_profile_with_flops, + with_modules=args.torch_profile_with_modules, + ) + prof = profiler_context.__enter__() + while iteration < args.train_iters: 
iteration_start_time = time.perf_counter() @@ -242,6 +271,12 @@ def train( iteration=iteration, ) + # pytorch profiler + if torch_profile_on: + prof.step() + + if torch_profile_on: + profiler_context.__exit__(*sys.exc_info()) torch_distributed.barrier() save_checkpoint( model=model, # type: ignore From f81cbd4ca20c209035bd6c74ca81d49cec9e51ae Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 3 Sep 2024 00:08:05 +0900 Subject: [PATCH 07/44] feat: instruction tuning (best setting) --- .vscode/settings.json | 1 + .../Llama-3-8B/Llama-3-8B-instruct-v0.2.sh | 8 ++++---- src/llama_recipes/arguments.py | 18 ++++++++++++------ src/llama_recipes/finetuning.py | 7 +++++++ src/llama_recipes/get_fsdp.py | 3 +++ src/llama_recipes/utils/train_utils.py | 2 +- 6 files changed, 28 insertions(+), 11 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index c99ea13..19bc179 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -58,6 +58,7 @@ "stabilityai", "stablelm", "stockmark", + "tensorboard", "tflops", "tobytes", "Xformer" diff --git a/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh index 4ebf185..7dca54d 100644 --- a/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh +++ b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh @@ -1,6 +1,6 @@ #!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:01:00:00 +#$ -l rt_AF=1 +#$ -l h_rt=0:08:00:00 #$ -j y #$ -o outputs/instruction/Llama-3-8B/ #$ -cwd @@ -82,7 +82,7 @@ mpirun -np $NUM_GPUS \ -x MASTER_ADDR=$MASTER_ADDR \ -x MASTER_PORT=$MASTER_PORT \ -bind-to none \ - -x PATH \ + -x NCCL_IB_TIMEOUT=22 \ -x LD_LIBRARY_PATH \ -x PATH \ python examples/finetuning.py \ @@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --save-interval 500 \ + --save-interval 10 \ --eval-interval 500 \ --eval-iters 10 \ --bf16 \ diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 2addf03..a2d2c48 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -360,12 +360,18 @@ def _add_torch_profiler_args(parser: argparse.ArgumentParser) -> argparse.Argume '--torch-profile-skip-first', type=int, default=1, help='Number of iterations to skip before profiling' ) - group.add_argument('--torch-profile-record-shapes', action='store_true', - help='Save information about operator’s input shapes') - group.add_argument('--torch-profile-profile-memory', action='store_true', - help='Track tensor memory allocation/deallocation') - group.add_argument('--torch-profile-with-stack', action='store_true', - help='Record source information for the ops') + group.add_argument( + '--torch-profile-record-shapes', action='store_true', + help='Save information about operator’s input shapes' + ) + group.add_argument( + '--torch-profile-profile-memory', action='store_true', + help='Track tensor memory allocation/deallocation' + ) + group.add_argument( + '--torch-profile-with-stack', action='store_true', + help='Record source information for the ops' + ) group.add_argument( '--torch-profile-with-flops', action='store_true', help='Use formula to estimate the FLOPs' ) diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index d771c90..15ee388 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -145,6 +145,9 @@ def main() -> None: model_name=args.base_model, ) + from torch.distributed._tensor.device_mesh import init_device_mesh # 
type: ignore + device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(world_size, )) + model = FSDP( model, # type: ignore auto_wrap_policy=wrapping_policy, @@ -159,8 +162,12 @@ def main() -> None: ) if args.low_cpu_fsdp and rank != 0 else None, + device_mesh=device_mesh, ) if args.fsdp_activation_checkpointing: + # ref: https://github.com/meta-llama/llama-recipes/blob/778e31e35cfbe385a31b3a94b794e3f75e276d1a/src/llama_recipes/finetuning.py#L193-L195 + # model.enable_input_require_grads() + # model.gradient_checkpointing_enable() apply_fsdp_checkpointing(model=model, model_name=args.base_model) if args.direct_preference_optimization: diff --git a/src/llama_recipes/get_fsdp.py b/src/llama_recipes/get_fsdp.py index 615dda2..4b7d1ca 100644 --- a/src/llama_recipes/get_fsdp.py +++ b/src/llama_recipes/get_fsdp.py @@ -12,6 +12,9 @@ def get_sharding_strategy() -> ShardingStrategy: elif args.sharding_strategy == "NO_SHARD": return ShardingStrategy.NO_SHARD elif args.sharding_strategy == "HYBRID_SHARD": + # TODO: https://pytorch.org/tutorials/recipes/distributed_device_mesh.html#how-to-use-devicemesh-with-hsdp + # support device mesh + # ref: https://github.com/meta-llama/llama-recipes/blob/778e31e35cfbe385a31b3a94b794e3f75e276d1a/src/llama_recipes/finetuning.py#L160 return ShardingStrategy.HYBRID_SHARD elif args.sharding_strategy == "_HYBRID_SHARD_ZERO2": return ShardingStrategy._HYBRID_SHARD_ZERO2 diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index 34c8534..b6c13c7 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -83,7 +83,7 @@ def train( # skip batch if args.instruction_tuning or args.direct_preference_optimization: assert args.continual_pretraining is False - print_rank_0(f"Skipping {iteration} batches") + print_rank_0(f"Skipping {iteration} iterations") for _ in range(iteration * gradient_accumulation_steps): next(train_dataloader) From 5bd284e7812f6ab7bf972408c9e6328035d9d2f4 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 16:59:06 +0900 Subject: [PATCH 08/44] feat: dataset merge --- tools/dataset/fileter.py | 46 ++++++++++++++++++++++++++++++++++ tools/dataset/merge_dataset.sh | 29 ++++++++++++++------- 2 files changed, 66 insertions(+), 9 deletions(-) create mode 100644 tools/dataset/fileter.py diff --git a/tools/dataset/fileter.py b/tools/dataset/fileter.py new file mode 100644 index 0000000..428722b --- /dev/null +++ b/tools/dataset/fileter.py @@ -0,0 +1,46 @@ +import argparse +import json +from typing import List, Dict + + +def process_jsonl(file_path: str, threshold: float) -> List[Dict]: + filtered_data = [] + with open(file_path, "r") as file: + for line in file: + entry = json.loads(line) + if "overall" not in entry["scores"]: + continue + + if entry["scores"]["overall"] >= threshold: + conversations = entry["conversations"] + # Get all messages except the last assistant message + input_messages = conversations[:-1] + assert len(conversations) % 2 == 0 + # Get only the last assistant message + output_message = conversations[-1] + assert output_message["role"] == "assistant" + assert type(output_message) is dict + filtered_data.append({"input": input_messages, "output": output_message}) + return filtered_data + + +def main(): + parser = argparse.ArgumentParser(description="Filter JSONL file based on score threshold") + parser.add_argument("--input_file", type=str, help="Path to input JSONL file") + parser.add_argument("--output_file", type=str, help="Path to output 
JSONL file") + parser.add_argument("--threshold", type=int, default=4, help="Score threshold for filtering (default: 0.0)") + + args = parser.parse_args() + + filtered_data = process_jsonl(args.input_file, args.threshold) + + with open(args.output_file, "w", encoding="utf-8") as outfile: + for entry in filtered_data: + json.dump(entry, outfile, ensure_ascii=False) + outfile.write("\n") + + print(f"Processed data has been written to {args.output_file}") + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 837c86c..046e275 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,17 +1,28 @@ #!/bin/bash -INPUT_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/formatted -OUTPUT_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/imitation_2_oasst2_top1 +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 mkdir -p $OUTPUT_DIR -cat $INPUT_DIR/oasst1-21k-ja-mixtral-imitation_2.jsonl $INPUT_DIR/oasst2-top1-en.jsonl > $OUTPUT_DIR/merged.jsonl +FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/lm_scored.jsonl" +) -echo "Merged dataset is saved at $OUTPUT_DIR/merged.jsonl" +MERGED_FILE=$OUTPUT_DIR/merged.jsonl -# swich virtual env -source .env/bin/activate +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done -python tools/dataset/shuffle_and_split.py \ - --input $OUTPUT_DIR/merged.jsonl \ - --output $OUTPUT_DIR +# fileter +python tools/dataset/fileter.py \ + --input_file $MERGED_FILE \ + --output_file $OUTPUT_DIR/train.jsonl \ + --threshold 0 + +rm $MERGED_FILE From d9982b6fb067da2029356deb338e1bcb1753dddf Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 17:00:14 +0900 Subject: [PATCH 09/44] feat: exp1-3 dataset merge --- tools/dataset/merge_dataset.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 046e275..9915741 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,7 +1,7 @@ #!/bin/bash INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja -OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 mkdir -p $OUTPUT_DIR @@ -23,6 +23,10 @@ done python tools/dataset/fileter.py \ --input_file $MERGED_FILE \ --output_file $OUTPUT_DIR/train.jsonl \ - --threshold 0 + --threshold 4 rm $MERGED_FILE + +echo "Done" + +wc -l $OUTPUT_DIR/train.jsonl From dda376c68b7ac7896b08f0d5a8d0a2197f5c216e Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 17:03:27 +0900 Subject: [PATCH 10/44] feat: exp1-4 dataset merge --- tools/dataset/merge_dataset.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 9915741..50e2884 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,7 +1,7 @@ #!/bin/bash INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja 
-OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 mkdir -p $OUTPUT_DIR @@ -23,7 +23,7 @@ done python tools/dataset/fileter.py \ --input_file $MERGED_FILE \ --output_file $OUTPUT_DIR/train.jsonl \ - --threshold 4 + --threshold 7 rm $MERGED_FILE From f198b1c86db73e424b831f52e4fab8a0e3f250e4 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 21:54:19 +0900 Subject: [PATCH 11/44] feat: Llama-3.1-instruct --- .../Llama-3.1-8B-instruct-exp1-1.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp1-2.sh | 122 ++++++++++++++++++ scripts/index.sh | 11 -- tools/pre-process/scripts/index.sh | 15 ++- 4 files changed, 256 insertions(+), 14 deletions(-) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh delete mode 100644 scripts/index.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh new file mode 100644 index 0000000..2db9f5c --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + 
python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh new file mode 100644 index 0000000..612b1ac --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2e-6 +MIN_LR=2e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + 
-bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/index.sh b/scripts/index.sh deleted file mode 100644 index 9871629..0000000 --- a/scripts/index.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -#$ -l rt_F=1 -#$ -l h_rt=1:0:00:00 -#$ -j y -#$ -o outputs/index/ -#$ -cwd - -# swich virtual env -source .env/bin/activate - -python src/llama_recipes/datasets/index.py diff --git a/tools/pre-process/scripts/index.sh b/tools/pre-process/scripts/index.sh index 2a46f3f..16a3f84 100644 --- a/tools/pre-process/scripts/index.sh +++ b/tools/pre-process/scripts/index.sh @@ -2,8 +2,17 @@ source .env/bin/activate -INPUT_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 -# baseline python tools/pre-process/index_dataset.py \ - --data-file-path $INPUT_DIR/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl + --data-file-path $INPUT_DIR/train.jsonl + +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 + +python tools/pre-process/index_dataset.py \ + --data-file-path $INPUT_DIR/train.jsonl + +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 + +python tools/pre-process/index_dataset.py \ + --data-file-path $INPUT_DIR/train.jsonl From 13153798b568ef85203467617e969c275d48b97a Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 22:06:06 +0900 Subject: [PATCH 12/44] feat: Llama-3.1-8B instruct --- .../Llama-3.1-8B-instruct-exp1-3.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp1-4.sh | 122 ++++++++++++++++++ 2 files changed, 244 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh new file mode 100644 index 0000000..972aec0 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich 
virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh new file mode 100644 index 0000000..127120f --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 
+module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2e-6 +MIN_LR=2e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From 80347a88e4799983915daf2a0ba41096ef8b5e4d Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 01:14:17 +0900 Subject: [PATCH 13/44] feat: dataset merge --- tools/dataset/extract_jsonl.py | 65 ++++++++++++++++++++++ tools/dataset/lmsys_dataset.py | 90 +++++++++++++++++++++++++++++++ tools/dataset/merge_dataset.sh | 48 +++++++++++++++-- tools/dataset/merge_next_token.sh | 15 ++++++ 4 files changed, 214 insertions(+), 4 deletions(-) create mode 100644 tools/dataset/extract_jsonl.py create 
mode 100644 tools/dataset/lmsys_dataset.py create mode 100644 tools/dataset/merge_next_token.sh diff --git a/tools/dataset/extract_jsonl.py b/tools/dataset/extract_jsonl.py new file mode 100644 index 0000000..164be17 --- /dev/null +++ b/tools/dataset/extract_jsonl.py @@ -0,0 +1,65 @@ +import argparse +import json +import random +from pathlib import Path + + +def count_lines(file_path): + with open(file_path, "r", encoding="utf-8") as f: + return sum(1 for _ in f) + + +def extract_random_lines(input_path, output_path, num_lines): + total_lines = count_lines(input_path) + + if num_lines >= total_lines: + print( + f"Warning: Requested {num_lines} lines, but file only contains {total_lines} lines. Extracting all lines." + ) + num_lines = total_lines + + selected_indices = set(random.sample(range(total_lines), num_lines)) + + with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile: + for i, line in enumerate(infile): + if i in selected_indices: + try: + # Verify that the line is valid JSON + json.loads(line.strip()) + outfile.write(line) + except json.JSONDecodeError: + print(f"Warning: Invalid JSON on line {i+1}. Skipping.") + selected_indices.remove(i) + if not selected_indices: + break + + +def main(): + parser = argparse.ArgumentParser(description="Extract specified number of random lines from a JSONL file.") + parser.add_argument("--input-path", required=True, help="Path to the input JSONL file") + parser.add_argument("--output-path", required=True, help="Path to the output JSONL file") + parser.add_argument("--num-lines", type=int, required=True, help="Number of lines to extract") + parser.add_argument("--seed", type=int, help="Random seed for reproducibility") + + args = parser.parse_args() + + input_path = Path(args.input_path) + output_path = Path(args.output_path) + + if not input_path.exists(): + print(f"Error: Input file '{input_path}' does not exist.") + return + + if not input_path.is_file(): + print(f"Error: '{input_path}' is not a file.") + return + + if args.seed is not None: + random.seed(args.seed) + + extract_random_lines(input_path, output_path, args.num_lines) + print(f"Extracted {args.num_lines} random lines from '{input_path}' to '{output_path}'.") + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/lmsys_dataset.py b/tools/dataset/lmsys_dataset.py new file mode 100644 index 0000000..4d2d725 --- /dev/null +++ b/tools/dataset/lmsys_dataset.py @@ -0,0 +1,90 @@ +import argparse +import json +import hashlib +from typing import Any + + +def process_sample(sample: dict[str, Any]) -> dict[str, Any] | None: + conversation = sample.get("conversation", []) + if len(conversation) < 2: + return None + + user_message = conversation[0] + assistant_message = conversation[1] + + if not user_message.get("content") or not assistant_message.get("content"): + return None + + result = { + "input": [{"role": "user", "content": user_message["content"]}], + "output": {"role": "assistant", "content": assistant_message["content"]}, + "conversation": sample, + "redacted": "NAME_" in user_message["content"] or "NAME_" in assistant_message["content"], + } + + return result + + +def hash_sample(sample: dict[str, Any]) -> str: + return hashlib.md5(json.dumps(sample, sort_keys=True).encode()).hexdigest() + + +def main(input_file: str, output_file: str, include_redacted: bool): + with open(input_file, "r", encoding="utf-8") as f: + data = [json.loads(line) for line in f] + + processed_samples = [] + hash_set = set() + invalid_count = 0 
+ redacted_count = 0 + non_redacted_count = 0 + + for sample in data: + processed = process_sample(sample) + if processed: + if processed["redacted"]: + redacted_count += 1 + if include_redacted: + sample_hash = hash_sample(processed) + if sample_hash not in hash_set: + hash_set.add(sample_hash) + processed_samples.append(processed) + else: + print(f"Duplicate redacted sample found: {sample}") + else: + non_redacted_count += 1 + sample_hash = hash_sample(processed) + if sample_hash not in hash_set: + hash_set.add(sample_hash) + processed_samples.append(processed) + else: + print(f"Duplicate non-redacted sample found: {sample}") + else: + print(f"Invalid sample: {sample}") + invalid_count += 1 + + with open(output_file, "w", encoding="utf-8") as f: + for sample in processed_samples: + json.dump(sample, f, ensure_ascii=False) + f.write("\n") + + print(f"Processed {len(processed_samples)} unique samples.") + print(f"Found {invalid_count} invalid samples.") + print(f"Total samples: {len(data)}") + print(f"Unique non-redacted samples: {non_redacted_count}") + print(f"Redacted samples: {redacted_count}") + if include_redacted: + print("Redacted samples included in output") + else: + print("Redacted samples not included in output") + print(f"Output written to {output_file}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert JSON to specified JSONL format") + parser.add_argument("--input-file", required=True, help="Input JSON file path") + parser.add_argument("--output-file", required=True, help="Output JSONL file path") + parser.add_argument("--include-redacted", action="store_true", help="Include redacted samples in output") + args = parser.parse_args() + + main(args.input_file, args.output_file, args.include_redacted) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 50e2884..9169b52 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,7 +1,16 @@ #!/bin/bash -INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja -OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 +set -e + +INCLUDE_REDACTED=true +FILTERD_SCORE=7 +NEXT_TOKEN_PERCENT=0.25 + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT + +if $INCLUDE_REDACTED; then + OUTPUT_DIR=$OUTPUT_DIR-redacted +fi mkdir -p $OUTPUT_DIR @@ -23,10 +32,41 @@ done python tools/dataset/fileter.py \ --input_file $MERGED_FILE \ --output_file $OUTPUT_DIR/train.jsonl \ - --threshold 7 + --threshold $FILTERD_SCORE rm $MERGED_FILE -echo "Done" +echo "Filtered open assistant data:" +wc -l $OUTPUT_DIR/train.jsonl + +if $INCLUDE_REDACTED; then + LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl +else + LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-no-redacted.jsonl +fi + +cat $LMSYS_FILE >> $OUTPUT_DIR/train.jsonl + +INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') +NEXT_TOKEN_SAMPLES=$(echo "$INSTRUCTION_SAMPLES * $NEXT_TOKEN_PERCENT / 1" | bc) +python tools/dataset/extract_jsonl.py \ + --input-path /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format/merged.jsonl \ + --output-path $OUTPUT_DIR/next-token.jsonl \ + --num-lines $NEXT_TOKEN_SAMPLES \ + --seed 1234 + +echo "Next token data:" +wc -l $OUTPUT_DIR/next-token.jsonl + +cat $OUTPUT_DIR/next-token.jsonl >> $OUTPUT_DIR/train.jsonl + +echo "Total data:" wc -l 
$OUTPUT_DIR/train.jsonl + +rm $OUTPUT_DIR/next-token.jsonl + +# indexing + +python tools/pre-process/index_dataset.py \ + --data-file-path $OUTPUT_DIR/train.jsonl diff --git a/tools/dataset/merge_next_token.sh b/tools/dataset/merge_next_token.sh new file mode 100644 index 0000000..4daff5e --- /dev/null +++ b/tools/dataset/merge_next_token.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format + +mkdir -p $OUTPUT_DIR + +FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k -name "*.jsonl") + +MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE From 721a1433de9099696c18112ac97b49bea6185568 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 01:14:35 +0900 Subject: [PATCH 14/44] feat: next token prediction style --- src/llama_recipes/utils/instruction_tuning.py | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 1cb73aa..a707368 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -64,26 +64,34 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: print(f"index={index}, offset={offset}, line={line}, error={e}") exit(1) - SYSTEM_PROMPT: list[dict[str, str]] = [ - { - "role": self.system_prompt_role, - "content": self.system_prompt_content, - } - ] - # chat template - prompt = self.tokenizer.apply_chat_template( - conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore - add_generation_prompt=True, - tokenize=True, - ) - - example = self.tokenizer.apply_chat_template( - conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore - {"role": "assistant", "content": conversations["output"]} - ], - tokenize=True, - ) - tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) + if 'role' in conversations and conversations["role"] == "next_token_prediction": + prompt = self.tokenizer.bos_token + example = self.tokenizer.encode( + conversations["content"], # type: ignore + add_special_tokens=False + ) + tensor_example = torch.tensor(example, dtype=torch.int64) + else: + SYSTEM_PROMPT: list[dict[str, str]] = [ + { + "role": self.system_prompt_role, + "content": self.system_prompt_content, + } + ] + # chat template + prompt = self.tokenizer.apply_chat_template( + conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore + add_generation_prompt=True, + tokenize=True, + ) + + example = self.tokenizer.apply_chat_template( + conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore + {"role": "assistant", "content": conversations["output"]} + ], + tokenize=True, + ) + tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) if len(example) > self.max_words: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") From 1a34e24456d27e7247ebc262a6baf2cf3cb5b763 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 11:59:11 +0900 Subject: [PATCH 15/44] feat: lmsys-chat-1m job script --- .../Llama-3.1-8B-instruct-exp2-1.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-2.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-3.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-4.sh | 122 ++++++++++++++++++ 4 files changed, 488 insertions(+) create mode 100644 
scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh new file mode 100644 index 0000000..2bfe779 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ 
+ --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh new file mode 100644 index 0000000..8c85d39 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ 
+ --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh new file mode 100644 index 0000000..f833e8d --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + 
--lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh new file mode 100644 index 0000000..bb48b38 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-6 +MIN_LR=1e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path 
${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From ecc287921978ea6fb07a8b64348b492fef1cbb55 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 15:25:06 +0900 Subject: [PATCH 16/44] feat: checkpoint convert script --- tools/checkpoint-convert/convert_ckpt.py | 21 ++++++--- .../scripts/abci/convert_ckpt_instruct.sh | 44 ++++++++++--------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/tools/checkpoint-convert/convert_ckpt.py b/tools/checkpoint-convert/convert_ckpt.py index e8dcb32..be48a4c 100644 --- a/tools/checkpoint-convert/convert_ckpt.py +++ b/tools/checkpoint-convert/convert_ckpt.py @@ -1,35 +1,42 @@ import argparse import torch -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( - "--model", type=str, required=True, help="HuggingFace transformers model name" + "--hf-base-model-checkpoint-path", type=str, + required=True, help="HuggingFace transformers model name" + ) + parser.add_argument("--hf-tokenizer-path", type=str, required=True) + parser.add_argument( + "--pytorch-model-checkpoint-path", type=str, + required=True, help="Path to checkpoint (`model.pth`)" ) - parser.add_argument("--ckpt", type=str, required=True, help="Path to checkpoint (`model.pth`)") parser.add_argument("--out", type=str, required=True, help="Path to output directory") parser.add_argument("--sequence-length", type=int, required=True) args = parser.parse_args() - print(f"Loading HF model: {args.model}", flush=True) + print(f"Loading HF model: {args.hf_base_model_checkpoint_path}", flush=True) model = AutoModelForCausalLM.from_pretrained( - args.model, + args.hf_base_model_checkpoint_path, torch_dtype=torch.bfloat16, trust_remote_code=True, max_position_embeddings=args.sequence_length, ) + tokenizer = AutoTokenizer.from_pretrained(args.hf_tokenizer_path) - print(f"Loading CKPT: {args.ckpt}", flush=True) - state_dict = torch.load(args.ckpt, map_location="cpu") + print(f"Loading CKPT: {args.pytorch_model_checkpoint_path}", flush=True) + state_dict = torch.load(args.pytorch_model_checkpoint_path, map_location="cpu") print("Loading state dict into HF model", flush=True) model.load_state_dict(state_dict) print("Saving HF model", flush=True) model.save_pretrained(args.out, safe_serialization=True) + tokenizer.save_pretrained(args.out) if __name__ == "__main__": diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index e54860b..7f601d4 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -20,32 +20,34 @@ set -e # swich virtual env 
source .env/bin/activate -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_1e-5_MINLR_1e-6_WD_0.1_GC_1 +LATEST_ITERATION=$(cat ${CHECKPOINT_DIR}/latest_iteration.txt) -echo "MASTER_ADDR=${MASTER_ADDR}" +echo "LATEST_ITERATION=${LATEST_ITERATION}" -start=578 -end=578 -increment=5000 +BASE_MODEL_CHECKPOINT=/bb/llm/gaf51275/hf-checkpoints/Meta-Llama-3.1-8B-Instruct +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +OUTPUT_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/ +EXTRACTED_PATH=$(echo $CHECKPOINT_DIR | awk -F'/Llama-3.1-8B-Instruct/' '{print $2}') +OUTPUT_DIR="${OUTPUT_DIR}${EXTRACTED_PATH}" -for ((i = start; i <= end; i += increment)); do - ITERATION=$i - FORMATTED_ITERATION=$(printf "iter_%07d" $ITERATION) +echo "convert ${CHECKPOINT_DIR} to ${OUTPUT_DIR}" +mkdir -p $OUTPUT_DIR - CHECK_POINT_PATH=/bb/llm/gaf51275/llama/checkpoints/Swallow-70b-VE-chat/oasst2-top1-imitation-2-3-lr_1e-5-minlr_1e-6-GB_256/${FORMATTED_ITERATION}/model.pt - OUTPUT_PATH=/bb/llm/gaf51275/llama/converted-hf-checkpoint/Swallow-70b-VE-chat/oasst2-top1-imitation-2-3-lr_1e-5-minlr_1e-6-GB_256/${FORMATTED_ITERATION} +ITERATION=$LATEST_ITERATION +FORMATTED_ITERATION=$(printf "iter_%07d" $ITERATION) - echo "convert ${CHECK_POINT_PATH} to ${OUTPUT_PATH}" +CHECK_POINT_PATH=${CHECKPOINT_DIR}/${FORMATTED_ITERATION}/model.pt +OUTPUT_PATH=${OUTPUT_DIR}/${FORMATTED_ITERATION} - mkdir -p $OUTPUT_PATH +echo "convert ${CHECK_POINT_PATH} to ${OUTPUT_PATH}" - BASE_MODEL_CHECKPOINT=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf +mkdir -p $OUTPUT_PATH - python tools/checkpoint-convert/convert_ckpt.py \ - --model $BASE_MODEL_CHECKPOINT \ - --ckpt $CHECK_POINT_PATH \ - --out $OUTPUT_PATH \ - --sequence-length 4096 -done +# convert +python tools/checkpoint-convert/convert_ckpt.py \ + --hf-base-model-checkpoint-path $BASE_MODEL_CHECKPOINT \ + --hf-tokenizer-path $TOKENIZER_DIR \ + --pytorch-model-checkpoint-path $CHECK_POINT_PATH \ + --out $OUTPUT_PATH \ + --sequence-length 8192 From 652faff5317129294a009b242c2b4009d4479121 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 16:17:02 +0900 Subject: [PATCH 17/44] chore: update dataset merge script --- tools/dataset/merge_dataset.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 9169b52..2715f64 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -3,8 +3,8 @@ set -e INCLUDE_REDACTED=true -FILTERD_SCORE=7 -NEXT_TOKEN_PERCENT=0.25 +FILTERD_SCORE=4 +NEXT_TOKEN_PERCENT=0.5 OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -48,7 +48,7 @@ fi cat $LMSYS_FILE >> $OUTPUT_DIR/train.jsonl INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') -NEXT_TOKEN_SAMPLES=$(echo "$INSTRUCTION_SAMPLES * $NEXT_TOKEN_PERCENT / 1" | bc) +NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) python tools/dataset/extract_jsonl.py \ --input-path /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format/merged.jsonl \ From 11d69183d1fbe581100817406667a22faabe3df5 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 16:17:51 +0900 
Subject: [PATCH 18/44] feat: instruct model upload script --- .../scripts/abci/convert_ckpt_instruct.sh | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index 7f601d4..25e532c 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -16,6 +16,7 @@ module load hpcx/2.12 module load gcc/11.4.0 set -e +export HF_HOME="/groups/gag51395/.cache/huggigface" # swich virtual env source .env/bin/activate @@ -51,3 +52,36 @@ python tools/checkpoint-convert/convert_ckpt.py \ --pytorch-model-checkpoint-path $CHECK_POINT_PATH \ --out $OUTPUT_PATH \ --sequence-length 8192 + +# upload +upload_checkpoint() { + local upload_dir=$1 + local repo_name=$2 + local max_retries=5 + local retry_count=0 + + while [ $retry_count -lt $max_retries ]; do + if python scripts/abci/upload/upload.py \ + --ckpt-path "$upload_dir" \ + --repo-name "$repo_name"; then + echo "Successfully uploaded $repo_name" + return 0 + else + echo "Upload failed for $repo_name. Retrying..." + ((retry_count++)) + sleep 5 + fi + done + + echo "Failed to upload $repo_name after $max_retries attempts" + return 1 +} + +EXP_NAME=$(echo $EXTRACTED_PATH | sed 's/\//-/g') +HF_REPO_NAME="tokyotech-llm/Llama-3.1-8B-Instruct-${EXP_NAME}-${FORMATTED_ITERATION}" + +echo "upload ${OUTPUT_PATH} to ${HF_REPO_NAME}" + +if ! upload_checkpoint "$OUTPUT_PATH" "$HF_REPO_NAME"; then + echo "Skipping to next checkpoint after repeated failures for $HF_REPO_NAME" +fi From 09e48db75f78a0ca4395cbd114fb37f1be83138b Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 10 Sep 2024 13:38:11 +0900 Subject: [PATCH 19/44] feat: dataset merge & check --- tools/dataset/convert_conversation.py | 40 ++++++++++++++++++++++++++ tools/dataset/merge_dataset.sh | 6 ++-- tools/dataset/merge_next_token.sh | 2 +- tools/dataset/next_token_prediciton.py | 24 ++++++++++++++++ 4 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 tools/dataset/convert_conversation.py create mode 100644 tools/dataset/next_token_prediciton.py diff --git a/tools/dataset/convert_conversation.py b/tools/dataset/convert_conversation.py new file mode 100644 index 0000000..5ca72f8 --- /dev/null +++ b/tools/dataset/convert_conversation.py @@ -0,0 +1,40 @@ +import argparse +import json +import sys + + +def process_jsonl(input_file, output_file): + with open(input_file, "r") as infile, open(output_file, "w") as outfile: + for line in infile: + data = json.loads(line) + + conversations = data.get("conversations", []) + assert len(conversations) >= 2 + + input_data = conversations[:-1] + output_data = conversations[-1] + + data["input"] = input_data + data["output"] = output_data + + json.dump(data, outfile) + outfile.write("\n") + + +def main(): + parser = argparse.ArgumentParser(description="Process JSONL data") + parser.add_argument("--input", help="Input JSONL file") + parser.add_argument("--output", help="Output JSONL file") + + args = parser.parse_args() + + try: + process_jsonl(args.input, args.output) + print(f"Processing complete. 
Output written to {args.output}") + except Exception as e: + print(f"An error occurred: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 2715f64..dbabc69 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -3,8 +3,8 @@ set -e INCLUDE_REDACTED=true -FILTERD_SCORE=4 -NEXT_TOKEN_PERCENT=0.5 +FILTERD_SCORE=7 +NEXT_TOKEN_PERCENT=0.25 OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -19,7 +19,7 @@ FILES=( "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/lm_scored.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" ) MERGED_FILE=$OUTPUT_DIR/merged.jsonl diff --git a/tools/dataset/merge_next_token.sh b/tools/dataset/merge_next_token.sh index 4daff5e..72c43e3 100644 --- a/tools/dataset/merge_next_token.sh +++ b/tools/dataset/merge_next_token.sh @@ -4,7 +4,7 @@ OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-predicti mkdir -p $OUTPUT_DIR -FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k -name "*.jsonl") +FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format -name "*.jsonl") MERGED_FILE=$OUTPUT_DIR/merged.jsonl diff --git a/tools/dataset/next_token_prediciton.py b/tools/dataset/next_token_prediciton.py new file mode 100644 index 0000000..903ce63 --- /dev/null +++ b/tools/dataset/next_token_prediciton.py @@ -0,0 +1,24 @@ +import argparse +import json + +def check_jsonl_file(file_path): + with open(file_path, 'r') as file: + for line_number, line in enumerate(file, 1): + try: + json_obj = json.loads(line) + if json_obj.get('role') == 'next_token_prediction': + pass + else: + print(f"Line {line_number}: 'role': 'next_token_prediction' not found") + except json.JSONDecodeError: + print(f"Line {line_number}: Invalid JSON") + +def main(): + parser = argparse.ArgumentParser(description="Check JSONL file for 'role': 'next_token_prediction'") + parser.add_argument('--file_path', help='Path to the JSONL file') + args = parser.parse_args() + + check_jsonl_file(args.file_path) + +if __name__ == '__main__': + main() From 8e352515fa4ac848292224675dfe837d8919efe2 Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 10 Sep 2024 13:38:49 +0900 Subject: [PATCH 20/44] feat: Llama-3.1-8B Instruct exp2 --- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh | 2 +- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh | 2 +- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh | 2 +- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh index 2bfe779..1751716 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS 
-MICRO_BATCH_SIZE=4
+MICRO_BATCH_SIZE=2
 GLOBAL_BATCH_SIZE=256

 # optimizer config
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
index 8c85d39..3aa6216 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
@@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"
 SEQ_LENGTH=8192
 DATA_PARALLEL_SIZE=$NUM_GPUS

-MICRO_BATCH_SIZE=4
+MICRO_BATCH_SIZE=2
 GLOBAL_BATCH_SIZE=256

 # optimizer config
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
index f833e8d..e5afab6 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
@@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"
 SEQ_LENGTH=8192
 DATA_PARALLEL_SIZE=$NUM_GPUS

-MICRO_BATCH_SIZE=4
+MICRO_BATCH_SIZE=2
 GLOBAL_BATCH_SIZE=256

 # optimizer config
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh
index bb48b38..fe513da 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh
@@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"
 SEQ_LENGTH=8192
 DATA_PARALLEL_SIZE=$NUM_GPUS

-MICRO_BATCH_SIZE=4
+MICRO_BATCH_SIZE=2
 GLOBAL_BATCH_SIZE=256

 # optimizer config
From 8ee006a6d7dc27b67b407834731e13c32cef292d Mon Sep 17 00:00:00 2001
From: kazuki
Date: Tue, 10 Sep 2024 16:26:42 +0900
Subject: [PATCH 21/44] feat: change eval interval

---
 .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh | 2 +-
 .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh | 2 +-
 .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh | 2 +-
 .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh | 2 +-
 src/llama_recipes/utils/instruction_tuning.py | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh
index 1751716..f674a55 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh
@@ -103,7 +103,7 @@ mpirun -np $NUM_GPUS \
 --adam-beta2 0.95 \
 --adam-eps 1e-8 \
 --save-interval 500 \
- --eval-interval 500 \
+ --eval-interval 500000 \
 --eval-iters 10 \
 --bf16 \
 --mixed-precision \
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
index 3aa6216..edcc309 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
@@ -103,7 +103,7 @@ mpirun -np $NUM_GPUS \
 --adam-beta2 0.95 \
 --adam-eps 1e-8 \
 --save-interval 500 \
- --eval-interval 500 \
+ --eval-interval 500000 \
 --eval-iters 10 \
 --bf16 \
 --mixed-precision \
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
index e5afab6..2fe0feb 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
+++ 
b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh @@ -103,7 +103,7 @@ mpirun -np $NUM_GPUS \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ --save-interval 500 \ - --eval-interval 500 \ + --eval-interval 500000 \ --eval-iters 10 \ --bf16 \ --mixed-precision \ diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh index fe513da..dd41a05 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh @@ -103,7 +103,7 @@ mpirun -np $NUM_GPUS \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ --save-interval 500 \ - --eval-interval 500 \ + --eval-interval 500000 \ --eval-iters 10 \ --bf16 \ --mixed-precision \ diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index a707368..8d00fd6 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -22,7 +22,7 @@ def __init__( args = get_args() self.data_path: str = data_path - self.max_words: int = args.seq_length + self.max_tokens: int = args.seq_length self.tokenizer = tokenizer # system prompt @@ -93,10 +93,10 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: ) tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) - if len(example) > self.max_words: + if len(example) > self.max_tokens: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") - padding_length: int = self.max_words - len(example) + padding_length: int = self.max_tokens - len(example) eos_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] pad_token_id = eos_token_id if padding_length > 0: @@ -105,7 +105,7 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: ) tensor_example = torch.cat((tensor_example, pad_tensor)) elif padding_length < 0: - tensor_example = tensor_example[: self.max_words] + tensor_example = tensor_example[: self.max_tokens] labels = copy.deepcopy(tensor_example) # promptの長さ分だけ -1 で埋める -> 損失関数で無視するようになる From 1eecce4266d3eda152b34fb4f272105d2cc90ab5 Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 10 Sep 2024 23:04:54 +0900 Subject: [PATCH 22/44] fix: instruction with next-token prediction --- src/llama_recipes/utils/instruction_tuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 8d00fd6..48f91f2 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -68,7 +68,7 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: prompt = self.tokenizer.bos_token example = self.tokenizer.encode( conversations["content"], # type: ignore - add_special_tokens=False + add_special_tokens=True # text + (=) ) tensor_example = torch.tensor(example, dtype=torch.int64) else: From 8080e0742fe40feed4e96b4f7ea6ad49cc6eeb62 Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 10 Sep 2024 23:22:17 +0900 Subject: [PATCH 23/44] chore: tools script (move directory) --- tools/dataset/{ => converter}/convert_conversation.py | 0 tools/dataset/{ => converter}/convert_dataset_dpo.py | 0 tools/dataset/{ => converter}/convert_dataset_instruct.py | 0 tools/dataset/{ => debug}/debug_chat_template.py | 2 +- tools/dataset/{ => debug}/debug_instruction.py | 0 5 files changed, 1 insertion(+), 1 deletion(-) rename tools/dataset/{ => 
converter}/convert_conversation.py (100%) rename tools/dataset/{ => converter}/convert_dataset_dpo.py (100%) rename tools/dataset/{ => converter}/convert_dataset_instruct.py (100%) rename tools/dataset/{ => debug}/debug_chat_template.py (98%) rename tools/dataset/{ => debug}/debug_instruction.py (100%) diff --git a/tools/dataset/convert_conversation.py b/tools/dataset/converter/convert_conversation.py similarity index 100% rename from tools/dataset/convert_conversation.py rename to tools/dataset/converter/convert_conversation.py diff --git a/tools/dataset/convert_dataset_dpo.py b/tools/dataset/converter/convert_dataset_dpo.py similarity index 100% rename from tools/dataset/convert_dataset_dpo.py rename to tools/dataset/converter/convert_dataset_dpo.py diff --git a/tools/dataset/convert_dataset_instruct.py b/tools/dataset/converter/convert_dataset_instruct.py similarity index 100% rename from tools/dataset/convert_dataset_instruct.py rename to tools/dataset/converter/convert_dataset_instruct.py diff --git a/tools/dataset/debug_chat_template.py b/tools/dataset/debug/debug_chat_template.py similarity index 98% rename from tools/dataset/debug_chat_template.py rename to tools/dataset/debug/debug_chat_template.py index 025b71e..20a79d1 100644 --- a/tools/dataset/debug_chat_template.py +++ b/tools/dataset/debug/debug_chat_template.py @@ -45,7 +45,7 @@ } ] -chat_template: str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +chat_template: str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" # noqa: print("before apply chat template") diff --git a/tools/dataset/debug_instruction.py b/tools/dataset/debug/debug_instruction.py similarity index 100% rename from tools/dataset/debug_instruction.py rename to tools/dataset/debug/debug_instruction.py From c05d127d475d17ed4a98fbdb580ff9b0aafb75a9 Mon Sep 17 00:00:00 2001 From: kazuki Date: Wed, 11 Sep 2024 01:34:52 +0900 Subject: [PATCH 24/44] feat: dataset --- tools/dataset/filter/filter_gemma_magpie.py | 109 ++++++++++++++++++++ tools/dataset/lmsys_dataset.py | 1 + tools/dataset/merge_gemma_magpie.sh | 16 +++ 3 files changed, 126 insertions(+) create mode 100644 tools/dataset/filter/filter_gemma_magpie.py create mode 100644 tools/dataset/merge_gemma_magpie.sh diff --git a/tools/dataset/filter/filter_gemma_magpie.py b/tools/dataset/filter/filter_gemma_magpie.py new file mode 100644 index 0000000..9a48d99 --- /dev/null +++ b/tools/dataset/filter/filter_gemma_magpie.py @@ -0,0 +1,109 @@ +import argparse +import json +import sys +import random +import re + + +def is_empty_or_template(content): + content = content.strip() + return content in ("", "\n", "\n\n") or content in ("回答例:", "回答例;", "解答例:", "解答例;") + + +def clean_content_start(content): + # Remove leading ">\n\n" or ">\n\n\n" + content = re.sub(r"^>\n\n+", "", content) + # Remove leading asterisks + content = 
re.sub(r"^\s*\*+\s*", "", content) + return content + + +def clean_content_end(content): + # Remove leading newlines and spaces + content = content.lstrip("\n ") + + # Process the end of the content + lines = content.splitlines() + if lines: + # Clean the last line + last_line = lines[-1].rstrip() + # Remove trailing "**" if present + last_line = re.sub(r"\*+\s*$", "", last_line) + lines[-1] = last_line + + # Join the lines back together + content = "\n".join(lines) + + # Remove trailing asterisks followed by newline + content = re.sub(r"\*+\s*\n$", "\n", content) + + # Ensure the content ends with exactly one newline + content = content.rstrip() + "\n" + + return content + + +def process_jsonl(input_file, output_file): + processed_data = [] + seen_contents = set() + with open(input_file, "r") as infile: + for line in infile: + try: + data = json.loads(line) + + # Transform input + if "input" in data: + data["input"] = [data["input"]] + + # Clean input and output content + input_content = clean_content_end(clean_content_start(data["input"][0].get("content", ""))) + output_content = clean_content_end(clean_content_start(data.get("output", {}).get("content", ""))) + + # Check for empty or template content + if is_empty_or_template(input_content) or is_empty_or_template(output_content): + continue + + # Check for duplicates + content_pair = (input_content, output_content) + if content_pair in seen_contents: + continue + seen_contents.add(content_pair) + + # Update cleaned contents + data["input"][0]["content"] = input_content + data["output"]["content"] = output_content + + # add text section + data["text"] = "user: " + input_content + "\n" + "assistant: " + output_content + + processed_data.append(data) + + except json.JSONDecodeError: + print(f"Error decoding JSON: {line}", file=sys.stderr) + + # Shuffle the processed data + random.shuffle(processed_data) + + # Write the shuffled data to the output file + with open(output_file, "w") as outfile: + for data in processed_data: + json.dump(data, outfile) + outfile.write("\n") + + +def main(): + parser = argparse.ArgumentParser(description="Process and shuffle JSONL files") + parser.add_argument("--input", help="Input JSONL file") + parser.add_argument("--output", help="Output JSONL file") + parser.add_argument("--seed", type=int, help="Random seed for shuffling", default=123) + + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + + process_jsonl(args.input, args.output) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/lmsys_dataset.py b/tools/dataset/lmsys_dataset.py index 4d2d725..7c7cb27 100644 --- a/tools/dataset/lmsys_dataset.py +++ b/tools/dataset/lmsys_dataset.py @@ -20,6 +20,7 @@ def process_sample(sample: dict[str, Any]) -> dict[str, Any] | None: "output": {"role": "assistant", "content": assistant_message["content"]}, "conversation": sample, "redacted": "NAME_" in user_message["content"] or "NAME_" in assistant_message["content"], + "text": "user: " + user_message["content"] + "\n\nassistant: " + assistant_message["content"] } return result diff --git a/tools/dataset/merge_gemma_magpie.sh b/tools/dataset/merge_gemma_magpie.sh new file mode 100644 index 0000000..afd6d5f --- /dev/null +++ b/tools/dataset/merge_gemma_magpie.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it + +mkdir -p $OUTPUT_DIR +rm $OUTPUT_DIR/merged.jsonl + +FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it -name "*.jsonl") + 
+MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE From 8da4e779d1c177c1d729ad5317e278238dce50ee Mon Sep 17 00:00:00 2001 From: kazuki Date: Wed, 11 Sep 2024 21:56:29 +0900 Subject: [PATCH 25/44] feat: instruct tuning scripts --- .../Llama-3.1-8B-instruct-exp2-5.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-6.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-7.sh | 122 ++++++++++++++++++ 3 files changed, 366 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh new file mode 100644 index 0000000..bd16373 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=2 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-5/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-5-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size 
${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh new file mode 100644 index 0000000..b7840d0 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=2 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-6/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.5-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-6-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python 
examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh new file mode 100644 index 0000000..508fd54 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=2 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-7/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-7-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x 
MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From eea3bf16b524cc025df81b81843d4cc81d8c2d1c Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 02:34:54 +0900 Subject: [PATCH 26/44] fix: instruction tuning --- src/llama_recipes/utils/instruction_tuning.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 48f91f2..062783b 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -64,12 +64,15 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: print(f"index={index}, offset={offset}, line={line}, error={e}") exit(1) + eod_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] + if 'role' in conversations and conversations["role"] == "next_token_prediction": - prompt = self.tokenizer.bos_token + prompt = [self.tokenizer.bos_token_id] example = self.tokenizer.encode( conversations["content"], # type: ignore - add_special_tokens=True # text + (=) + add_special_tokens=True # text + ) + example += [eod_token_id] tensor_example = torch.tensor(example, dtype=torch.int64) else: SYSTEM_PROMPT: list[dict[str, str]] = [ @@ -81,24 +84,25 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: # chat template prompt = self.tokenizer.apply_chat_template( conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore - add_generation_prompt=True, tokenize=True, ) example = self.tokenizer.apply_chat_template( conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore - {"role": "assistant", "content": conversations["output"]} + conversations["output"] ], tokenize=True, ) tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) + # print(f"prompt: {self.tokenizer.decode(prompt, skip_special_tokens=False)}\n\nexample: {self.tokenizer.decode(example, skip_special_tokens=False)}", flush=True) + if len(example) > self.max_tokens: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") padding_length: int = self.max_tokens - len(example) - eos_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] - pad_token_id = eos_token_id + pad_token_id: int = self.tokenizer.pad_token_id # type: ignore + assert pad_token_id is not None if padding_length > 0: pad_tensor = torch.full( (padding_length,), pad_token_id, 
dtype=torch.int64 From 5ab7b56a71a725ad1ffcfacad6410a1dd92fea7b Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 02:50:21 +0900 Subject: [PATCH 27/44] feat: instruction tuning script --- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh | 4 ++-- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh | 4 ++-- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh | 6 +++--- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh | 2 +- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh index 2db9f5c..1d6424d 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_AF=1 +#$ -l rt_AF=8 #$ -l h_rt=1:00:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ @@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --save-interval 500 \ + --save-interval 50000 \ --eval-interval 500 \ --eval-iters 10 \ --bf16 \ diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh index 612b1ac..4b7df07 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_AF=1 +#$ -l rt_AF=8 #$ -l h_rt=1:00:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ @@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --save-interval 500 \ + --save-interval 50000 \ --eval-interval 500 \ --eval-iters 10 \ --bf16 \ diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh index 127120f..6e47a3b 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_AF=1 +#$ -l rt_AF=8 #$ -l h_rt=1:00:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=2e-6 -MIN_LR=2e-7 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh index f674a55..d4f6e7d 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh index edcc309..16eb573 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh +++ 
b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh index 2fe0feb..3306610 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh index dd41a05..d473880 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh index bd16373..a6bab02 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh index b7840d0..5add4f8 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh index 508fd54..226e89a 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config From e48e11655399673d6780c946c7987fdfbbba0d12 Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 02:51:44 +0900 Subject: [PATCH 28/44] feat: instruction script --- tools/inference/inference.py | 23 ++++++++++++++------ tools/inference/inference_abci.sh | 35 +++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 tools/inference/inference_abci.sh diff --git a/tools/inference/inference.py b/tools/inference/inference.py index e0a1a58..2991255 100644 --- a/tools/inference/inference.py +++ b/tools/inference/inference.py @@ -9,6 +9,7 @@ parser.add_argument("--model-path", type=str) 
parser.add_argument("--tokenizer-path", type=str) parser.add_argument("--prompt", type=str, default=None) +parser.add_argument("--chat-template", action="store_true") args = parser.parse_args() @@ -22,11 +23,21 @@ device_map="auto", torch_dtype=torch.bfloat16 ) -input_ids: torch.Tensor = tokenizer.encode( # type: ignore - args.prompt, - add_special_tokens=False, - return_tensors="pt" -) +if args.chat_template: + input_ids = tokenizer.apply_chat_template( # type: ignore + [ + {"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。"}, + {"role": "user", "content": args.prompt}, + ], + tokenize=True, + return_tensors="pt" + ) +else: + input_ids: torch.Tensor = tokenizer.encode( # type: ignore + args.prompt, + add_special_tokens=False, + return_tensors="pt" + ) outputs = model.generate( # type: ignore input_ids.to(device=model.device), # type: ignore max_new_tokens=1024, @@ -35,5 +46,5 @@ do_sample=True, ) -generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) +generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False) print(generated_text) diff --git a/tools/inference/inference_abci.sh b/tools/inference/inference_abci.sh new file mode 100644 index 0000000..e9fbab9 --- /dev/null +++ b/tools/inference/inference_abci.sh @@ -0,0 +1,35 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=0:01:00:00 +#$ -j y +#$ -o outputs/inference/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +set -e + +# swich virtual env +source .env/bin/activate + +INFERENCE_MODEL_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/exp2-1/LR_2.5e-5_MINLR_2.5e-6_WD_0.1_GC_1/iter_0005078 + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "Please explain Credit Default Swaps." 
\ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "会社法について説明してください。" \ + --chat-template From d3e5c233d00014254506f377301bd95255f89130 Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 03:29:25 +0900 Subject: [PATCH 29/44] feat: instruction assertion --- src/llama_recipes/arguments.py | 1 + src/llama_recipes/utils/instruction_tuning.py | 50 +++++++++++++++---- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index a2d2c48..e3c38fc 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -339,6 +339,7 @@ def _add_instruction_tuning_args(parser: argparse.ArgumentParser) -> argparse.Ar group.add_argument( "--save-sampler-state", action="store_true", ) + group.add_argument("--instruct-debug", action="store_true") return parser diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 062783b..446e384 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -24,6 +24,7 @@ def __init__( self.data_path: str = data_path self.max_tokens: int = args.seq_length self.tokenizer = tokenizer + self.debug_mode = args.instruct_debug # system prompt self.system_prompt_role = args.system_prompt_role @@ -66,11 +67,10 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: eod_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] - if 'role' in conversations and conversations["role"] == "next_token_prediction": + if "role" in conversations and conversations["role"] == "next_token_prediction": prompt = [self.tokenizer.bos_token_id] example = self.tokenizer.encode( - conversations["content"], # type: ignore - add_special_tokens=True # text + + conversations["content"], add_special_tokens=True # type: ignore # text + ) example += [eod_token_id] tensor_example = torch.tensor(example, dtype=torch.int64) @@ -88,14 +88,16 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: ) example = self.tokenizer.apply_chat_template( - conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore - conversations["output"] - ], + conversation=SYSTEM_PROMPT + conversations["input"] + [conversations["output"]], # type: ignore tokenize=True, ) tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) - # print(f"prompt: {self.tokenizer.decode(prompt, skip_special_tokens=False)}\n\nexample: {self.tokenizer.decode(example, skip_special_tokens=False)}", flush=True) + if self.debug_mode: + print( + f"prompt: {self.tokenizer.decode(prompt, skip_special_tokens=False)}\n\nexample: {self.tokenizer.decode(example, skip_special_tokens=False)}\n\n", + flush=True, + ) if len(example) > self.max_tokens: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") @@ -104,9 +106,7 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: pad_token_id: int = self.tokenizer.pad_token_id # type: ignore assert pad_token_id is not None if padding_length > 0: - pad_tensor = torch.full( - (padding_length,), pad_token_id, dtype=torch.int64 - ) + pad_tensor = torch.full((padding_length,), pad_token_id, dtype=torch.int64) tensor_example = torch.cat((tensor_example, pad_tensor)) elif padding_length < 0: tensor_example = tensor_example[: self.max_tokens] @@ -126,6 +126,36 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: # mask out 
pad token attention_mask = (tensor_example != pad_token_id).float() + # assert + if self.debug_mode: + # padding + pad_ignore_count = torch.sum((tensor_example == pad_token_id) & (labels == IGNORE_INDEX)).item() + assert ( + pad_ignore_count == padding_length + ), f"Number of IGNORE_INDEX due to padding ({pad_ignore_count}) does not match padding_length ({padding_length})" + + # prompt + non_pad_ignore_count = torch.sum( + (tensor_example != pad_token_id) & (labels == IGNORE_INDEX)).item() + assert non_pad_ignore_count == len( + prompt + ), f"Number of IGNORE_INDEX not due to padding ({non_pad_ignore_count}) does not match prompt length ({len(prompt)})" + + # labels' non ignore index + if "output" in conversations: + non_ignore_labels = labels[labels != IGNORE_INDEX] + + chat_template = [conversations["output"]] + expected_tokens = self.tokenizer.apply_chat_template( + chat_template, return_tensors="pt", tokenize=True # type: ignore + ).squeeze() # type: ignore + if expected_tokens[0] == self.tokenizer.bos_token_id: + expected_tokens = expected_tokens[1:] + + assert torch.all( + non_ignore_labels == expected_tokens + ), "Non-ignored labels do not match the tokenized last assistant message" + return { "input_ids": tensor_example, "labels": labels, From 1f536275d045c542a3d764657e4d05ccaa3ba87e Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 21:17:53 +0900 Subject: [PATCH 30/44] feat: checkpoint convert & inference script --- .../scripts/abci/convert_ckpt_instruct.sh | 4 ++-- tools/inference/inference_abci.sh | 20 ++++++++++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index 25e532c..586dc12 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_F=1 +#$ -l rt_AF=1 #$ -l h_rt=1:00:00 #$ -j y #$ -o outputs/convert/ckpt/ @@ -21,7 +21,7 @@ export HF_HOME="/groups/gag51395/.cache/huggigface" # swich virtual env source .env/bin/activate -CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_1e-5_MINLR_1e-6_WD_0.1_GC_1 +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-1/LR_2.5e-5_MINLR_2.5e-6_WD_0.1_GC_1 LATEST_ITERATION=$(cat ${CHECKPOINT_DIR}/latest_iteration.txt) echo "LATEST_ITERATION=${LATEST_ITERATION}" diff --git a/tools/inference/inference_abci.sh b/tools/inference/inference_abci.sh index e9fbab9..3544d80 100644 --- a/tools/inference/inference_abci.sh +++ b/tools/inference/inference_abci.sh @@ -20,7 +20,7 @@ set -e # swich virtual env source .env/bin/activate -INFERENCE_MODEL_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/exp2-1/LR_2.5e-5_MINLR_2.5e-6_WD_0.1_GC_1/iter_0005078 +INFERENCE_MODEL_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/exp2-4/LR_1e-6_MINLR_1e-7_WD_0.1_GC_1/iter_0004000 python tools/inference/inference.py \ --model-path $INFERENCE_MODEL_DIR \ @@ -33,3 +33,21 @@ python tools/inference/inference.py \ --tokenizer-path $INFERENCE_MODEL_DIR \ --prompt "会社法について説明してください。" \ --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "東京工業大学のキャンパスはどこにありますか?" 
\ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "1+4+8の答えはいくつでしょうか?" \ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "Pythonでデータ構造のUnionFindクラスを作成してください。" \ + --chat-template From 688afb437863deda9351a0e39b92ca0b1ddf137b Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 21:21:06 +0900 Subject: [PATCH 31/44] feat: dataset merge script --- tools/dataset/merge_dataset.sh | 53 ++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index dbabc69..646d1e9 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -2,9 +2,10 @@ set -e -INCLUDE_REDACTED=true +INCLUDE_REDACTED=false FILTERD_SCORE=7 -NEXT_TOKEN_PERCENT=0.25 +NEXT_TOKEN_PERCENT=0.5 +USE_OPEN_ASSISTANT=false OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -12,32 +13,42 @@ if $INCLUDE_REDACTED; then OUTPUT_DIR=$OUTPUT_DIR-redacted fi +if ! $USE_OPEN_ASSISTANT; then + OUTPUT_DIR=$OUTPUT_DIR-no-oasst +fi + mkdir -p $OUTPUT_DIR -FILES=( - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" -) +if $USE_OPEN_ASSISTANT; then + FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" + ) -MERGED_FILE=$OUTPUT_DIR/merged.jsonl + MERGED_FILE=$OUTPUT_DIR/merged.jsonl -for FILE in "${FILES[@]}"; do - cat $FILE >> $MERGED_FILE -done + for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE + done -# fileter -python tools/dataset/fileter.py \ - --input_file $MERGED_FILE \ - --output_file $OUTPUT_DIR/train.jsonl \ - --threshold $FILTERD_SCORE + # filter + python tools/dataset/fileter.py \ + --input_file $MERGED_FILE \ + --output_file $OUTPUT_DIR/train.jsonl \ + --threshold $FILTERD_SCORE -rm $MERGED_FILE + rm $MERGED_FILE -echo "Filtered open assistant data:" -wc -l $OUTPUT_DIR/train.jsonl + echo "Filtered open assistant data:" + wc -l $OUTPUT_DIR/train.jsonl +else + # Open Assistant データを使用しない場合は空のファイルを作成 + touch $OUTPUT_DIR/train.jsonl + echo "Skipped Open Assistant data processing." 
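# The "touch $OUTPUT_DIR/train.jsonl" fallback above creates an empty train.jsonl
# when the Open Assistant sources are skipped, presumably so that the later
# "cat ... >> $OUTPUT_DIR/train.jsonl" appends and the final "wc -l" sample count
# behave the same in both branches of the USE_OPEN_ASSISTANT switch.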
+fi if $INCLUDE_REDACTED; then LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl From 9998ca3aefb349cadd74143e5db2f586232a4399 Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 21:21:36 +0900 Subject: [PATCH 32/44] feat: change LR 5E-6 -> 1E-5 --- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh | 4 ++-- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh | 4 ++-- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh index a6bab02..b7498c2 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=5e-6 -MIN_LR=5e-7 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh index 5add4f8..8dd8d21 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=5e-6 -MIN_LR=5e-7 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh index 226e89a..dedc27f 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=5e-6 -MIN_LR=5e-7 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 From ff6f3647d0754b363efa71995d7b12117b7440e5 Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 22:39:35 +0900 Subject: [PATCH 33/44] feat: Llama-3.1-8B instruct --- .../Llama-3.1-8B-instruct-exp2-8.sh | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh new file mode 100644 index 0000000..d5b8b7f --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized 
SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-8/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-no-oasst + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-8-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From 254cbec40622baabe8090d6dd77920ab4b2ad2f5 Mon Sep 17 00:00:00 2001 From: kazuki Date: Fri, 13 Sep 2024 00:03:49 +0900 Subject: [PATCH 34/44] feat: llama-3.1-8b instruct --- .../Llama-3.1-8B-instruct-exp2-10+.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-10.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-9.sh | 122 ++++++++++++++++++ tools/dataset/merge_dataset.sh | 27 ++-- 4 files changed, 384 insertions(+), 9 deletions(-) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh new file mode 100644 index 0000000..c9a08ab --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh @@ -0,0 +1,122 @@ 
+#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10+/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10+-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh new file mode 
100644 index 0000000..cde3bf8 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git 
a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh new file mode 100644 index 0000000..103eaf1 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-9/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-en-oasst + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-9-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ 
+ --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 646d1e9..2d83b19 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -4,8 +4,9 @@ set -e INCLUDE_REDACTED=false FILTERD_SCORE=7 -NEXT_TOKEN_PERCENT=0.5 -USE_OPEN_ASSISTANT=false +NEXT_TOKEN_PERCENT=0.25 +USE_OPEN_ASSISTANT=true +USE_ONLY_ENGLISH_OPEN_ASSISTANT=true OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -15,18 +16,26 @@ fi if ! $USE_OPEN_ASSISTANT; then OUTPUT_DIR=$OUTPUT_DIR-no-oasst +elif $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then + OUTPUT_DIR=$OUTPUT_DIR-en-oasst fi mkdir -p $OUTPUT_DIR if $USE_OPEN_ASSISTANT; then - FILES=( - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" - ) + if $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then + FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" + ) + else + FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" + ) + fi MERGED_FILE=$OUTPUT_DIR/merged.jsonl From 85eabe1272eebfcad83ebf6575ba3a166971d33c Mon Sep 17 00:00:00 2001 From: kazuki Date: Fri, 13 Sep 2024 00:04:05 +0900 Subject: [PATCH 35/44] feat: dataset's merge script --- tools/dataset/merge_dataset.sh | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 2d83b19..9c36ee0 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -2,11 +2,12 @@ set -e -INCLUDE_REDACTED=false +INCLUDE_REDACTED=true FILTERD_SCORE=7 NEXT_TOKEN_PERCENT=0.25 -USE_OPEN_ASSISTANT=true -USE_ONLY_ENGLISH_OPEN_ASSISTANT=true +USE_OPEN_ASSISTANT=false +USE_ONLY_ENGLISH_OPEN_ASSISTANT=false +USE_ENGLISH_LMSYS=true OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -20,6 +21,10 @@ elif $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then OUTPUT_DIR=$OUTPUT_DIR-en-oasst fi +if $USE_ENGLISH_LMSYS; then + OUTPUT_DIR=$OUTPUT_DIR-en-lmsys +fi + mkdir -p $OUTPUT_DIR if $USE_OPEN_ASSISTANT; then @@ -37,7 +42,7 @@ if $USE_OPEN_ASSISTANT; then ) fi - MERGED_FILE=$OUTPUT_DIR/merged.jsonl + MERGED_FILE=$OUTPUT_DIR/merged_oasst.jsonl for FILE in "${FILES[@]}"; do cat $FILE >> $MERGED_FILE @@ -59,13 +64,21 @@ else echo "Skipped Open Assistant data processing." 
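# Worked example (illustration only) of how the flags above compose the output
# directory name. With INCLUDE_REDACTED=true, FILTERD_SCORE=7, NEXT_TOKEN_PERCENT=0,
# USE_OPEN_ASSISTANT=false and USE_ENGLISH_LMSYS=true, the script produces
#   exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys
# which is the DATASET_DIR consumed by Llama-3.1-8B-instruct-exp2-10.sh above:
# each boolean appends its own suffix (-redacted, -no-oasst or -en-oasst, -en-lmsys)
# to the exp2-filtered-<score>-next_token-<percent> base name.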
fi +# 日本語のLMSYSデータを常に使用 if $INCLUDE_REDACTED; then - LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl + JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl else - LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-no-redacted.jsonl + JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-no-redacted.jsonl fi -cat $LMSYS_FILE >> $OUTPUT_DIR/train.jsonl +cat $JA_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl + +# 英語のLMSYSデータを追加でオプションとして使用 +if $USE_ENGLISH_LMSYS; then + EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en.jsonl + cat $EN_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added English LMSYS data" +fi INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) From 748e2f455990dfea9603611ad723d5983da2d075 Mon Sep 17 00:00:00 2001 From: kazuki Date: Fri, 13 Sep 2024 00:05:48 +0900 Subject: [PATCH 36/44] chore: update dataset path (for exp10+) --- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh index c9a08ab..a0a3e84 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh @@ -67,7 +67,7 @@ CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted-no-oasst-en-lmsys TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl VALID_DATA_PATH=${DATASET_DIR}/train.jsonl From 72d1ddae0e78378584882e27e715beb1116ead24 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 15:22:17 +0900 Subject: [PATCH 37/44] feat: instruct scripts --- .../Llama-3.1-8B-instruct-exp2-10+.sh | 4 +- ....sh => Llama-3.1-8B-instruct-exp2-10-1.sh} | 0 .../Llama-3.1-8B-instruct-exp2-10-2.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-10-3.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-13.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-14.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-5.sh | 6 +- .../Llama-3.1-8B-instruct-exp2-6.sh | 6 +- 8 files changed, 496 insertions(+), 8 deletions(-) rename scripts/abci/instruction/Llama-3.1-8B/{Llama-3.1-8B-instruct-exp2-10.sh => Llama-3.1-8B-instruct-exp2-10-1.sh} (100%) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh index a0a3e84..2109b46 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh +++ 
b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=1e-5 -MIN_LR=1e-6 +LR=2.5e-5 +MIN_LR=2.5e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-1.sh similarity index 100% rename from scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh rename to scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-1.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh new file mode 100644 index 0000000..b2fb469 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + 
--lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh new file mode 100644 index 0000000..dbfeed3 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir 
${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh new file mode 100644 index 0000000..9fd9417 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=3.5e-5 +MIN_LR=3.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-13/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-13-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length 
${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh new file mode 100644 index 0000000..1549fb1 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-5 +MIN_LR=5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-14/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-14-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + 
-bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh index b7498c2..6ed7988 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=1e-5 -MIN_LR=1e-6 +LR=2.5e-5 +MIN_LR=2.5e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 @@ -67,7 +67,7 @@ CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-redacted TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl VALID_DATA_PATH=${DATASET_DIR}/train.jsonl diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh index 8dd8d21..95fcea5 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=1e-5 -MIN_LR=1e-6 +LR=2.5e-5 +MIN_LR=2.5e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 @@ -67,7 +67,7 @@ CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.5-redacted +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0.5-redacted TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl VALID_DATA_PATH=${DATASET_DIR}/train.jsonl From 28a63ccca42d164060d13e96375c50571d4fb5da Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:04:05 +0900 Subject: [PATCH 38/44] feat: update merge script --- tools/dataset/merge_dataset.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 9c36ee0..e80e9a6 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -2,9 +2,9 @@ set -e -INCLUDE_REDACTED=true -FILTERD_SCORE=7 -NEXT_TOKEN_PERCENT=0.25 +INCLUDE_REDACTED=false +FILTERD_SCORE=0 
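# Sketch of the cosine learning-rate decay selected by the instruction-tuning
# scripts above (--lr-decay-style cosine; each configuration pairs MIN_LR with
# LR / 10). The standard cosine form is assumed here for illustration; warmup and
# the exact schedule implemented in examples/finetuning.py are not reproduced.
LR=2.5e-5; MIN_LR=2.5e-6
for t in 0 0.25 0.5 0.75 1; do
    awk -v lr="$LR" -v min="$MIN_LR" -v t="$t" 'BEGIN {
        pi = atan2(0, -1)
        printf "progress=%.2f lr=%.2e\n", t, min + 0.5 * (lr - min) * (1 + cos(pi * t))
    }'
done
# progress=0.00 starts at LR (2.50e-05) and progress=1.00 ends at MIN_LR (2.50e-06).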
+NEXT_TOKEN_PERCENT=0 USE_OPEN_ASSISTANT=false USE_ONLY_ENGLISH_OPEN_ASSISTANT=false USE_ENGLISH_LMSYS=true @@ -68,14 +68,18 @@ fi if $INCLUDE_REDACTED; then JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl else - JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-no-redacted.jsonl + JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-ja-no-redacted.jsonl fi cat $JA_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl # 英語のLMSYSデータを追加でオプションとして使用 if $USE_ENGLISH_LMSYS; then - EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en.jsonl + if $INCLUDE_REDACTED; then + EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en.jsonl + else + EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en-no-redacted.jsonl + fi cat $EN_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl echo "Added English LMSYS data" fi From 156f12cab6c37118f60982a1dde08d66268ef147 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:04:23 +0900 Subject: [PATCH 39/44] feat: change dataset path for exp2-10-3 --- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh index dbfeed3..f7872b6 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh @@ -1,6 +1,6 @@ #!/bin/bash #$ -l rt_AF=8 -#$ -l h_rt=2:00:00:00 +#$ -l h_rt=1:16:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ #$ -cwd From c4e85979f0e5d78685358414ab6db1053fe7341d Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:54:20 +0900 Subject: [PATCH 40/44] feat: update dataset merge script (for exp2-10-4) --- tools/dataset/merge_dataset.sh | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index e80e9a6..16534b7 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -2,27 +2,41 @@ set -e +# Control variables INCLUDE_REDACTED=false FILTERD_SCORE=0 NEXT_TOKEN_PERCENT=0 USE_OPEN_ASSISTANT=false USE_ONLY_ENGLISH_OPEN_ASSISTANT=false USE_ENGLISH_LMSYS=true +USE_MAGPIE_ULTRA=true +CUSTOM_OUTPUT_DIR="" -OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT +# Base output directory +BASE_OUTPUT_DIR="/bb/llm/gaf51275/datasets/raw/instruct/training" + +if [ -z "$CUSTOM_OUTPUT_DIR" ]; then + OUTPUT_DIR="$BASE_OUTPUT_DIR/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT" +else + OUTPUT_DIR="$CUSTOM_OUTPUT_DIR" +fi if $INCLUDE_REDACTED; then - OUTPUT_DIR=$OUTPUT_DIR-redacted + OUTPUT_DIR="${OUTPUT_DIR}-redacted" fi if ! 
$USE_OPEN_ASSISTANT; then - OUTPUT_DIR=$OUTPUT_DIR-no-oasst + OUTPUT_DIR="${OUTPUT_DIR}-no-oasst" elif $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then - OUTPUT_DIR=$OUTPUT_DIR-en-oasst + OUTPUT_DIR="${OUTPUT_DIR}-en-oasst" fi if $USE_ENGLISH_LMSYS; then - OUTPUT_DIR=$OUTPUT_DIR-en-lmsys + OUTPUT_DIR="${OUTPUT_DIR}-en-lmsys" +fi + +if $USE_MAGPIE_ULTRA; then + OUTPUT_DIR="${OUTPUT_DIR}-magpie-ultra" fi mkdir -p $OUTPUT_DIR @@ -84,6 +98,13 @@ if $USE_ENGLISH_LMSYS; then echo "Added English LMSYS data" fi +# Add magpie-ultra dataset processing +if $USE_MAGPIE_ULTRA; then + MAGPIE_ULTRA_FILE=/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/train.jsonl + cat $MAGPIE_ULTRA_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added magpie-ultra data" +fi + INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) From 2c622153f0185bd256e3860a51567908f117b7d0 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:54:29 +0900 Subject: [PATCH 41/44] feat: exp2-10-4 --- .../Llama-3.1-8B-instruct-exp2-10-4.sh | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh new file mode 100644 index 0000000..2661f9f --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:16:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys-magpie-ultra + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl 
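# Worked example of the NEXT_TOKEN_SAMPLES arithmetic in merge_dataset.sh above
# (illustrative counts, not real dataset sizes): with NEXT_TOKEN_PERCENT=0.25 and
# 75000 instruction samples, the formula yields 25000 next-token-prediction
# samples, i.e. it sizes the next-token set so that it is NEXT_TOKEN_PERCENT of
# the combined 100000-sample mix. The trailing "/ 1" truncates the result to an
# integer because bc's default scale is 0.
echo "(75000 / (1 - 0.25)) * 0.25 / 1" | bc   # -> 25000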
+VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From 9ce970ee6da258c93b4e55e83fb2c50bd18ff5ab Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:54:44 +0900 Subject: [PATCH 42/44] feat: implement for magpie-ultra --- .../dataset/converter/convert_magpie_ultra.py | 85 +++++++++++++++++++ tools/dataset/merge_magpie_ultra.sh | 23 +++++ 2 files changed, 108 insertions(+) create mode 100644 tools/dataset/converter/convert_magpie_ultra.py create mode 100644 tools/dataset/merge_magpie_ultra.sh diff --git a/tools/dataset/converter/convert_magpie_ultra.py b/tools/dataset/converter/convert_magpie_ultra.py new file mode 100644 index 0000000..8e64b48 --- /dev/null +++ b/tools/dataset/converter/convert_magpie_ultra.py @@ -0,0 +1,85 @@ +import argparse +import json +import sys + + +def process_input(input_file): + data = [] + with open(input_file, "r", encoding="utf-8") as f: + for line in f: + try: + item = json.loads(line.strip()) + data.append(item) + except json.JSONDecodeError as e: + print(f"Warning: Skipping invalid JSON line: {line.strip()}", file=sys.stderr) + return data + + +def convert_to_output(input_data, include_english: bool = False): + output_data = [] + for item in input_data: + if item.get("quality") in ["average", "good", "excellent"]: + output_item = { + "input": [ + { + "role": "user", + "content": item["processed_translated_instruction"] + } + ], + "output": { + "role": "assistant", + "content": item["processed_translated_response"] + }, + "quality": item["quality"], + "primary_tag": item["primary_tag"], + } + output_data.append(output_item) + if include_english: + en_output_item = { + "input": [ + { + "role": "user", + "content": item["instruction"], + } + ], + "output": { + "role": "assistant", + "content": item["response"] + }, + "quality": item["quality"], + "primary_tag": item["primary_tag"], + } + output_data.append(en_output_item) + + return output_data + + +def save_output(output_data, output_file): + with open(output_file, "w", encoding="utf-8") as f: + for item in output_data: + json.dump(item, f, ensure_ascii=False) + f.write("\n") + + +def main(): + parser = 
argparse.ArgumentParser(description="Convert input JSONL to output JSONL") + parser.add_argument("--input", required=True, help="Input JSONL file path") + parser.add_argument("--output", required=True, help="Output JSONL file path") + parser.add_argument("--include-english", action="store_true") + + args = parser.parse_args() + + try: + input_data = process_input(args.input) + output_data = convert_to_output( + input_data=input_data, include_english=args.include_english + ) + save_output(output_data, args.output) + print(f"Conversion completed. Output saved to {args.output}") + except Exception as e: + print(f"An error occurred: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/merge_magpie_ultra.sh b/tools/dataset/merge_magpie_ultra.sh new file mode 100644 index 0000000..7513078 --- /dev/null +++ b/tools/dataset/merge_magpie_ultra.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data + +mkdir -p $OUTPUT_DIR +rm $OUTPUT_DIR/merged.jsonl + +FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_3.jsonl" +) + +MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE From 561f0f464c8b0dcd5adedabdec0ab9b2f5b50d9f Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 17:08:54 +0900 Subject: [PATCH 43/44] feat: exp2-10-5 --- .../Llama-3.1-8B-instruct-exp2-10-5.sh | 122 ++++++++++++++++++ tools/dataset/merge_dataset.sh | 12 ++ 2 files changed, 134 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh new file mode 100644 index 0000000..1f30b07 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=16 +#$ -l h_rt=1:16:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir 
-p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-5/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys-magpie-ultra-gemma-magpie + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-5-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 16534b7..b02410b 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -10,6 +10,7 @@ USE_OPEN_ASSISTANT=false USE_ONLY_ENGLISH_OPEN_ASSISTANT=false USE_ENGLISH_LMSYS=true USE_MAGPIE_ULTRA=true +USE_GEMMA_MAGPIE=true CUSTOM_OUTPUT_DIR="" # Base output directory @@ -39,6 +40,10 @@ if $USE_MAGPIE_ULTRA; then OUTPUT_DIR="${OUTPUT_DIR}-magpie-ultra" fi +if $USE_GEMMA_MAGPIE; then + OUTPUT_DIR="${OUTPUT_DIR}-gemma-magpie" +fi + mkdir -p $OUTPUT_DIR if $USE_OPEN_ASSISTANT; then @@ -105,6 +110,13 @@ if $USE_MAGPIE_ULTRA; then echo "Added magpie-ultra data" fi +# Add gemma-magpie dataset processing +if $USE_GEMMA_MAGPIE; then + GEMMA_MAGPIE_FILE=/bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it/format.jsonl + cat $GEMMA_MAGPIE_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added gemma-magpie data" +fi + INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) From 
3a9da08c09f3ad3de0e5bc7f3dc9c41fdbb886b8 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 16 Sep 2024 00:18:35 +0900 Subject: [PATCH 44/44] chore: update job running-time --- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh | 4 ++-- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh index 2661f9f..63810b5 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh @@ -1,6 +1,6 @@ #!/bin/bash -#$ -l rt_AF=8 -#$ -l h_rt=1:16:00:00 +#$ -l rt_AF=16 +#$ -l h_rt=1:8:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ #$ -cwd diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh index 95fcea5..21b4b8e 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -1,6 +1,6 @@ #!/bin/bash -#$ -l rt_AF=8 -#$ -l h_rt=2:00:00:00 +#$ -l rt_AF=16 +#$ -l h_rt=0:20:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ #$ -cwd
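
Note on how the magpie-ultra tooling added in the patches above is meant to chain together: merge_magpie_ultra.sh concatenates the raw shard files into merged.jsonl, convert_magpie_ultra.py keeps only rows rated average/good/excellent and reshapes them into the instruction-tuning record schema, and merge_dataset.sh (with USE_MAGPIE_ULTRA=true) appends the converted file to the combined training set. The sketch below illustrates that flow; the converter's input and output file names are assumptions inferred from the other two scripts, while the script paths, flags, and toggle names come directly from the patches.

# Sketch of the assumed end-to-end magpie-ultra data flow (not part of any patch).
# 1) Concatenate the six shard files into data/merged.jsonl.
bash tools/dataset/merge_magpie_ultra.sh

# 2) Filter by quality and convert to the {"input": [...], "output": {...}} schema.
#    The --input/--output paths here are assumptions; --include-english is optional
#    and additionally emits the untranslated English pair for each kept row.
python tools/dataset/converter/convert_magpie_ultra.py \
    --input /bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/merged.jsonl \
    --output /bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/train.jsonl

# 3) Append the converted file to train.jsonl for the experiment
#    (merge_dataset.sh reads it when USE_MAGPIE_ULTRA=true).
bash tools/dataset/merge_dataset.sh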