From 58ce89f144b1b3979a65fc03bbabb9b128d985ba Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 14:48:46 +0900 Subject: [PATCH 01/44] feat: distributed-timeout setting --- src/llama_recipes/arguments.py | 4 ++++ src/llama_recipes/finetuning.py | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 80d872b..3241a61 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -67,6 +67,10 @@ def _add_fsdp_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: group.add_argument( "--use-dist-ckpt", action="store_true" ) + group.add_argument( + '--distributed-timeout-minutes', type=int, default=10, + help='Timeout minutes for torch.distributed.' + ) return parser diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 786b946..3b6094c 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -69,7 +69,10 @@ def main() -> None: args.gradient_accumulation_steps = args.global_batch_size // (args.micro_batch_size * world_size) assert args.gradient_accumulation_steps >= 1 - torch_distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank) + timeout = timedelta(minutes=args.distributed_timeout_minutes) + torch_distributed.init_process_group( + backend="nccl", world_size=world_size, rank=rank, timeout=timeout, + ) # wandb setting if args.wandb_name is not None and is_rank_0(): From 47a786d6c111089a3da8663fd33760161300c3f7 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 15:50:04 +0900 Subject: [PATCH 02/44] feat: make anyprecision optimizer deprecated --- src/llama_recipes/arguments.py | 2 +- src/llama_recipes/finetuning.py | 27 ++++++++------------------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 3241a61..0337321 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -208,7 +208,7 @@ def _add_training_args(parser: argparse.ArgumentParser) -> argparse.ArgumentPars # optimizer group.add_argument( '--optimizer', type=str, default='adam', - choices=['adam', 'anyprecision'], + choices=['adam'], help='Optimizer function' ) group.add_argument( diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 3b6094c..d771c90 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -1,6 +1,7 @@ import copy import os import sys +from datetime import timedelta import torch import torch.distributed as torch_distributed @@ -258,25 +259,13 @@ def main() -> None: else: raise ValueError("unknown training mode") - if args.bf16 and args.optimizer == "anyprecision": - optimizer = AnyPrecisionAdamW( - model.parameters(), # type: ignore - lr=args.lr, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps, - momentum_dtype=torch.bfloat16, - variance_dtype=torch.bfloat16, - use_kahan_summation=False, - weight_decay=args.weight_decay, - ) - else: - optimizer = optim.AdamW( - model.parameters(), # type: ignore - lr=args.lr, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps, - weight_decay=args.weight_decay, - ) + optimizer = optim.AdamW( + model.parameters(), # type: ignore + lr=args.lr, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + weight_decay=args.weight_decay, + ) if args.load: if args.use_dist_ckpt: From 85404b8000cf5d90845b59de28021bddc06a95a2 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 
Sep 2024 15:52:02 +0900 Subject: [PATCH 03/44] fix: loss curve different bug when checkpoint load --- src/llama_recipes/utils/train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index cff3b2d..daf3d24 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -83,7 +83,7 @@ def train( if args.instruction_tuning or args.direct_preference_optimization: assert args.continual_pretraining is False print_rank_0(f"Skipping {iteration} batches") - for _ in range(iteration): + for _ in range(iteration * gradient_accumulation_steps): next(train_dataloader) while iteration < args.train_iters: From 9d7f02326a1a85273e7dcdce8009cd3e604eeedc Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 15:52:29 +0900 Subject: [PATCH 04/44] fix: make meta device load deprecated --- src/llama_recipes/get_models.py | 36 +++++++++------------------------ 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/src/llama_recipes/get_models.py b/src/llama_recipes/get_models.py index 05a9851..d1bdf78 100644 --- a/src/llama_recipes/get_models.py +++ b/src/llama_recipes/get_models.py @@ -34,33 +34,15 @@ def get_model( init_time = time.perf_counter() if "Llama" in model_name or "Swallow" in model_name: - if args.low_cpu_fsdp: - """ - for FSDP, we can save cpu memory by loading pretrained model on rank0 only. - this avoids cpu oom when loading large models like llama 70B, in which case - model alone would consume 2+TB cpu mem (70 * 4 * 8). This will add some communications - overhead. - """ - if is_rank_0(): - model = LlamaForCausalLM.from_pretrained( - model_name, - load_in_8bit=True if args.quantization else None, - device_map="auto" if args.quantization else None, - use_cache=use_cache, - ) - else: - llama_config = LlamaConfig.from_pretrained(model_name) - llama_config.use_cache = use_cache - with torch.device("meta"): - model = LlamaForCausalLM(llama_config) - - else: - model = LlamaForCausalLM.from_pretrained( - model_name, - load_in_8bit=True if args.quantization else None, - device_map="auto" if args.quantization else None, - use_cache=use_cache, - ) + model = LlamaForCausalLM.from_pretrained( + model_name, + load_in_8bit=True if args.quantization else None, + device_map="auto" if args.quantization else None, + use_cache=use_cache, + max_position_embeddings=args.seq_length, + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16 if args.bf16 else torch.float16, + ) elif "Mistral" in model_name or "mistral" in model_name or "Codestral" in model_name: # If using torch.device("meta"), FSDP training hang From 9f2dc2ff776886b396d53957f16d7c68a26c5537 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 15:55:49 +0900 Subject: [PATCH 05/44] chore: update spell checker's ignored words --- .vscode/settings.json | 1 + 1 file changed, 1 insertion(+) diff --git a/.vscode/settings.json b/.vscode/settings.json index bbadf97..c99ea13 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -47,6 +47,7 @@ "pbar", "peft", "plamo", + "pretraining", "probs", "psutil", "pubmed", From 5981e227a582493d2779e1bbdb69cd1e6e6f7454 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 2 Sep 2024 16:17:23 +0900 Subject: [PATCH 06/44] feat: torch profiler --- src/llama_recipes/arguments.py | 33 ++++++++++++++++++++++++ src/llama_recipes/utils/train_utils.py | 35 ++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git 
a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 0337321..2addf03 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -10,6 +10,7 @@ def parse_args() -> argparse.Namespace: parser = _add_training_args(parser=parser) parser = _add_regularization_args(parser=parser) parser = _add_instruction_tuning_args(parser=parser) + parser = _add_torch_profiler_args(parser=parser) args = parser.parse_args() @@ -340,3 +341,35 @@ def _add_instruction_tuning_args(parser: argparse.ArgumentParser) -> argparse.Ar ) return parser + + +def _add_torch_profiler_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + group = parser.add_argument_group(title='torch profiler') + + group.add_argument('--torch-profile', action='store_true', help='Enable torch profiler') + group.add_argument( + '--torch-profile-ranks', nargs='+', type=int, default=[0], help='Global ranks to profile' + ) + group.add_argument('--torch-profile-wait', type=int, default=0, help='Steps to wait before profiling') + group.add_argument('--torch-profile-warmup', type=int, default=1, help='Warmup steps before profiling') + group.add_argument('--torch-profile-active', type=int, default=1, help='Steps to profile') + group.add_argument( + '--torch-profile-repeat', type=int, default=1, help='Repeat profiling this number of times' + ) + group.add_argument( + '--torch-profile-skip-first', type=int, default=1, + help='Number of iterations to skip before profiling' + ) + group.add_argument('--torch-profile-record-shapes', action='store_true', + help='Save information about operator’s input shapes') + group.add_argument('--torch-profile-profile-memory', action='store_true', + help='Track tensor memory allocation/deallocation') + group.add_argument('--torch-profile-with-stack', action='store_true', + help='Record source information for the ops') + group.add_argument( + '--torch-profile-with-flops', action='store_true', help='Use formula to estimate the FLOPs' + ) + group.add_argument('--torch-profile-with-modules', action='store_true', help='Record module hierarchy ') + group.add_argument('--tensorboard-dir', type=str, default=None) + + return parser diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index daf3d24..34c8534 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -1,5 +1,6 @@ import os import time +import sys import torch import torch.cuda.nccl as nccl @@ -86,6 +87,34 @@ def train( for _ in range(iteration * gradient_accumulation_steps): next(train_dataloader) + # profile + torch_profile_on = args.torch_profile and ( + torch_distributed.get_rank() in args.torch_profile_ranks + ) + if torch_profile_on: + profiler_context = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + schedule=torch.profiler.schedule( + wait=args.torch_profile_wait, + warmup=args.torch_profile_warmup, + active=args.torch_profile_active, + repeat=args.torch_profile_repeat, + skip_first=args.torch_profile_skip_first, + ), + on_trace_ready=torch.profiler.tensorboard_trace_handler( + args.tensorboard_dir, use_gzip=False + ), + record_shapes=args.torch_profile_record_shapes, + profile_memory=args.torch_profile_profile_memory, + with_stack=args.torch_profile_with_stack, + with_flops=args.torch_profile_with_flops, + with_modules=args.torch_profile_with_modules, + ) + prof = profiler_context.__enter__() + while iteration < args.train_iters: 
iteration_start_time = time.perf_counter() @@ -242,6 +271,12 @@ def train( iteration=iteration, ) + # pytorch profiler + if torch_profile_on: + prof.step() + + if torch_profile_on: + profiler_context.__exit__(*sys.exc_info()) torch_distributed.barrier() save_checkpoint( model=model, # type: ignore From f81cbd4ca20c209035bd6c74ca81d49cec9e51ae Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 3 Sep 2024 00:08:05 +0900 Subject: [PATCH 07/44] feat: instruction tuning (best setting) --- .vscode/settings.json | 1 + .../Llama-3-8B/Llama-3-8B-instruct-v0.2.sh | 8 ++++---- src/llama_recipes/arguments.py | 18 ++++++++++++------ src/llama_recipes/finetuning.py | 7 +++++++ src/llama_recipes/get_fsdp.py | 3 +++ src/llama_recipes/utils/train_utils.py | 2 +- 6 files changed, 28 insertions(+), 11 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index c99ea13..19bc179 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -58,6 +58,7 @@ "stabilityai", "stablelm", "stockmark", + "tensorboard", "tflops", "tobytes", "Xformer" diff --git a/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh index 4ebf185..7dca54d 100644 --- a/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh +++ b/scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh @@ -1,6 +1,6 @@ #!/bin/bash -#$ -l rt_AF=2 -#$ -l h_rt=0:01:00:00 +#$ -l rt_AF=1 +#$ -l h_rt=0:08:00:00 #$ -j y #$ -o outputs/instruction/Llama-3-8B/ #$ -cwd @@ -82,7 +82,7 @@ mpirun -np $NUM_GPUS \ -x MASTER_ADDR=$MASTER_ADDR \ -x MASTER_PORT=$MASTER_PORT \ -bind-to none \ - -x PATH \ + -x NCCL_IB_TIMEOUT=22 \ -x LD_LIBRARY_PATH \ -x PATH \ python examples/finetuning.py \ @@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --save-interval 500 \ + --save-interval 10 \ --eval-interval 500 \ --eval-iters 10 \ --bf16 \ diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index 2addf03..a2d2c48 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -360,12 +360,18 @@ def _add_torch_profiler_args(parser: argparse.ArgumentParser) -> argparse.Argume '--torch-profile-skip-first', type=int, default=1, help='Number of iterations to skip before profiling' ) - group.add_argument('--torch-profile-record-shapes', action='store_true', - help='Save information about operator’s input shapes') - group.add_argument('--torch-profile-profile-memory', action='store_true', - help='Track tensor memory allocation/deallocation') - group.add_argument('--torch-profile-with-stack', action='store_true', - help='Record source information for the ops') + group.add_argument( + '--torch-profile-record-shapes', action='store_true', + help='Save information about operator’s input shapes' + ) + group.add_argument( + '--torch-profile-profile-memory', action='store_true', + help='Track tensor memory allocation/deallocation' + ) + group.add_argument( + '--torch-profile-with-stack', action='store_true', + help='Record source information for the ops' + ) group.add_argument( '--torch-profile-with-flops', action='store_true', help='Use formula to estimate the FLOPs' ) diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index d771c90..15ee388 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -145,6 +145,9 @@ def main() -> None: model_name=args.base_model, ) + from torch.distributed._tensor.device_mesh import init_device_mesh # 
type: ignore + device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(world_size, )) + model = FSDP( model, # type: ignore auto_wrap_policy=wrapping_policy, @@ -159,8 +162,12 @@ def main() -> None: ) if args.low_cpu_fsdp and rank != 0 else None, + device_mesh=device_mesh, ) if args.fsdp_activation_checkpointing: + # ref: https://github.com/meta-llama/llama-recipes/blob/778e31e35cfbe385a31b3a94b794e3f75e276d1a/src/llama_recipes/finetuning.py#L193-L195 + # model.enable_input_require_grads() + # model.gradient_checkpointing_enable() apply_fsdp_checkpointing(model=model, model_name=args.base_model) if args.direct_preference_optimization: diff --git a/src/llama_recipes/get_fsdp.py b/src/llama_recipes/get_fsdp.py index 615dda2..4b7d1ca 100644 --- a/src/llama_recipes/get_fsdp.py +++ b/src/llama_recipes/get_fsdp.py @@ -12,6 +12,9 @@ def get_sharding_strategy() -> ShardingStrategy: elif args.sharding_strategy == "NO_SHARD": return ShardingStrategy.NO_SHARD elif args.sharding_strategy == "HYBRID_SHARD": + # TODO: https://pytorch.org/tutorials/recipes/distributed_device_mesh.html#how-to-use-devicemesh-with-hsdp + # support device mesh + # ref: https://github.com/meta-llama/llama-recipes/blob/778e31e35cfbe385a31b3a94b794e3f75e276d1a/src/llama_recipes/finetuning.py#L160 return ShardingStrategy.HYBRID_SHARD elif args.sharding_strategy == "_HYBRID_SHARD_ZERO2": return ShardingStrategy._HYBRID_SHARD_ZERO2 diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index 34c8534..b6c13c7 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -83,7 +83,7 @@ def train( # skip batch if args.instruction_tuning or args.direct_preference_optimization: assert args.continual_pretraining is False - print_rank_0(f"Skipping {iteration} batches") + print_rank_0(f"Skipping {iteration} iterations") for _ in range(iteration * gradient_accumulation_steps): next(train_dataloader) From 5bd284e7812f6ab7bf972408c9e6328035d9d2f4 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 16:59:06 +0900 Subject: [PATCH 08/44] feat: dataset merge --- tools/dataset/fileter.py | 46 ++++++++++++++++++++++++++++++++++ tools/dataset/merge_dataset.sh | 29 ++++++++++++++------- 2 files changed, 66 insertions(+), 9 deletions(-) create mode 100644 tools/dataset/fileter.py diff --git a/tools/dataset/fileter.py b/tools/dataset/fileter.py new file mode 100644 index 0000000..428722b --- /dev/null +++ b/tools/dataset/fileter.py @@ -0,0 +1,46 @@ +import argparse +import json +from typing import List, Dict + + +def process_jsonl(file_path: str, threshold: float) -> List[Dict]: + filtered_data = [] + with open(file_path, "r") as file: + for line in file: + entry = json.loads(line) + if "overall" not in entry["scores"]: + continue + + if entry["scores"]["overall"] >= threshold: + conversations = entry["conversations"] + # Get all messages except the last assistant message + input_messages = conversations[:-1] + assert len(conversations) % 2 == 0 + # Get only the last assistant message + output_message = conversations[-1] + assert output_message["role"] == "assistant" + assert type(output_message) is dict + filtered_data.append({"input": input_messages, "output": output_message}) + return filtered_data + + +def main(): + parser = argparse.ArgumentParser(description="Filter JSONL file based on score threshold") + parser.add_argument("--input_file", type=str, help="Path to input JSONL file") + parser.add_argument("--output_file", type=str, help="Path to output 
JSONL file") + parser.add_argument("--threshold", type=int, default=4, help="Score threshold for filtering (default: 0.0)") + + args = parser.parse_args() + + filtered_data = process_jsonl(args.input_file, args.threshold) + + with open(args.output_file, "w", encoding="utf-8") as outfile: + for entry in filtered_data: + json.dump(entry, outfile, ensure_ascii=False) + outfile.write("\n") + + print(f"Processed data has been written to {args.output_file}") + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 837c86c..046e275 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,17 +1,28 @@ #!/bin/bash -INPUT_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/formatted -OUTPUT_DIR=/bb/llm/gaf51275/llama/finetuning/datasets/training/imitation_2_oasst2_top1 +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 mkdir -p $OUTPUT_DIR -cat $INPUT_DIR/oasst1-21k-ja-mixtral-imitation_2.jsonl $INPUT_DIR/oasst2-top1-en.jsonl > $OUTPUT_DIR/merged.jsonl +FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/lm_scored.jsonl" +) -echo "Merged dataset is saved at $OUTPUT_DIR/merged.jsonl" +MERGED_FILE=$OUTPUT_DIR/merged.jsonl -# swich virtual env -source .env/bin/activate +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done -python tools/dataset/shuffle_and_split.py \ - --input $OUTPUT_DIR/merged.jsonl \ - --output $OUTPUT_DIR +# fileter +python tools/dataset/fileter.py \ + --input_file $MERGED_FILE \ + --output_file $OUTPUT_DIR/train.jsonl \ + --threshold 0 + +rm $MERGED_FILE From d9982b6fb067da2029356deb338e1bcb1753dddf Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 17:00:14 +0900 Subject: [PATCH 09/44] feat: exp1-3 dataset merge --- tools/dataset/merge_dataset.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 046e275..9915741 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,7 +1,7 @@ #!/bin/bash INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja -OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 mkdir -p $OUTPUT_DIR @@ -23,6 +23,10 @@ done python tools/dataset/fileter.py \ --input_file $MERGED_FILE \ --output_file $OUTPUT_DIR/train.jsonl \ - --threshold 0 + --threshold 4 rm $MERGED_FILE + +echo "Done" + +wc -l $OUTPUT_DIR/train.jsonl From dda376c68b7ac7896b08f0d5a8d0a2197f5c216e Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 17:03:27 +0900 Subject: [PATCH 10/44] feat: exp1-4 dataset merge --- tools/dataset/merge_dataset.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 9915741..50e2884 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,7 +1,7 @@ #!/bin/bash INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja 
-OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 mkdir -p $OUTPUT_DIR @@ -23,7 +23,7 @@ done python tools/dataset/fileter.py \ --input_file $MERGED_FILE \ --output_file $OUTPUT_DIR/train.jsonl \ - --threshold 4 + --threshold 7 rm $MERGED_FILE From f198b1c86db73e424b831f52e4fab8a0e3f250e4 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 21:54:19 +0900 Subject: [PATCH 11/44] feat: Llama-3.1-instruct --- .../Llama-3.1-8B-instruct-exp1-1.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp1-2.sh | 122 ++++++++++++++++++ scripts/index.sh | 11 -- tools/pre-process/scripts/index.sh | 15 ++- 4 files changed, 256 insertions(+), 14 deletions(-) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh delete mode 100644 scripts/index.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh new file mode 100644 index 0000000..2db9f5c --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + 
python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh new file mode 100644 index 0000000..612b1ac --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2e-6 +MIN_LR=2e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + 
-bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/index.sh b/scripts/index.sh deleted file mode 100644 index 9871629..0000000 --- a/scripts/index.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -#$ -l rt_F=1 -#$ -l h_rt=1:0:00:00 -#$ -j y -#$ -o outputs/index/ -#$ -cwd - -# swich virtual env -source .env/bin/activate - -python src/llama_recipes/datasets/index.py diff --git a/tools/pre-process/scripts/index.sh b/tools/pre-process/scripts/index.sh index 2a46f3f..16a3f84 100644 --- a/tools/pre-process/scripts/index.sh +++ b/tools/pre-process/scripts/index.sh @@ -2,8 +2,17 @@ source .env/bin/activate -INPUT_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1 -# baseline python tools/pre-process/index_dataset.py \ - --data-file-path $INPUT_DIR/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl + --data-file-path $INPUT_DIR/train.jsonl + +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 + +python tools/pre-process/index_dataset.py \ + --data-file-path $INPUT_DIR/train.jsonl + +INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 + +python tools/pre-process/index_dataset.py \ + --data-file-path $INPUT_DIR/train.jsonl From 13153798b568ef85203467617e969c275d48b97a Mon Sep 17 00:00:00 2001 From: kazuki Date: Sun, 8 Sep 2024 22:06:06 +0900 Subject: [PATCH 12/44] feat: Llama-3.1-8B instruct --- .../Llama-3.1-8B-instruct-exp1-3.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp1-4.sh | 122 ++++++++++++++++++ 2 files changed, 244 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh new file mode 100644 index 0000000..972aec0 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich 
virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-3 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh new file mode 100644 index 0000000..127120f --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 
+module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2e-6 +MIN_LR=2e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-1.4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From 80347a88e4799983915daf2a0ba41096ef8b5e4d Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 01:14:17 +0900 Subject: [PATCH 13/44] feat: dataset merge --- tools/dataset/extract_jsonl.py | 65 ++++++++++++++++++++++ tools/dataset/lmsys_dataset.py | 90 +++++++++++++++++++++++++++++++ tools/dataset/merge_dataset.sh | 48 +++++++++++++++-- tools/dataset/merge_next_token.sh | 15 ++++++ 4 files changed, 214 insertions(+), 4 deletions(-) create mode 100644 tools/dataset/extract_jsonl.py create 
mode 100644 tools/dataset/lmsys_dataset.py create mode 100644 tools/dataset/merge_next_token.sh diff --git a/tools/dataset/extract_jsonl.py b/tools/dataset/extract_jsonl.py new file mode 100644 index 0000000..164be17 --- /dev/null +++ b/tools/dataset/extract_jsonl.py @@ -0,0 +1,65 @@ +import argparse +import json +import random +from pathlib import Path + + +def count_lines(file_path): + with open(file_path, "r", encoding="utf-8") as f: + return sum(1 for _ in f) + + +def extract_random_lines(input_path, output_path, num_lines): + total_lines = count_lines(input_path) + + if num_lines >= total_lines: + print( + f"Warning: Requested {num_lines} lines, but file only contains {total_lines} lines. Extracting all lines." + ) + num_lines = total_lines + + selected_indices = set(random.sample(range(total_lines), num_lines)) + + with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile: + for i, line in enumerate(infile): + if i in selected_indices: + try: + # Verify that the line is valid JSON + json.loads(line.strip()) + outfile.write(line) + except json.JSONDecodeError: + print(f"Warning: Invalid JSON on line {i+1}. Skipping.") + selected_indices.remove(i) + if not selected_indices: + break + + +def main(): + parser = argparse.ArgumentParser(description="Extract specified number of random lines from a JSONL file.") + parser.add_argument("--input-path", required=True, help="Path to the input JSONL file") + parser.add_argument("--output-path", required=True, help="Path to the output JSONL file") + parser.add_argument("--num-lines", type=int, required=True, help="Number of lines to extract") + parser.add_argument("--seed", type=int, help="Random seed for reproducibility") + + args = parser.parse_args() + + input_path = Path(args.input_path) + output_path = Path(args.output_path) + + if not input_path.exists(): + print(f"Error: Input file '{input_path}' does not exist.") + return + + if not input_path.is_file(): + print(f"Error: '{input_path}' is not a file.") + return + + if args.seed is not None: + random.seed(args.seed) + + extract_random_lines(input_path, output_path, args.num_lines) + print(f"Extracted {args.num_lines} random lines from '{input_path}' to '{output_path}'.") + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/lmsys_dataset.py b/tools/dataset/lmsys_dataset.py new file mode 100644 index 0000000..4d2d725 --- /dev/null +++ b/tools/dataset/lmsys_dataset.py @@ -0,0 +1,90 @@ +import argparse +import json +import hashlib +from typing import Any + + +def process_sample(sample: dict[str, Any]) -> dict[str, Any] | None: + conversation = sample.get("conversation", []) + if len(conversation) < 2: + return None + + user_message = conversation[0] + assistant_message = conversation[1] + + if not user_message.get("content") or not assistant_message.get("content"): + return None + + result = { + "input": [{"role": "user", "content": user_message["content"]}], + "output": {"role": "assistant", "content": assistant_message["content"]}, + "conversation": sample, + "redacted": "NAME_" in user_message["content"] or "NAME_" in assistant_message["content"], + } + + return result + + +def hash_sample(sample: dict[str, Any]) -> str: + return hashlib.md5(json.dumps(sample, sort_keys=True).encode()).hexdigest() + + +def main(input_file: str, output_file: str, include_redacted: bool): + with open(input_file, "r", encoding="utf-8") as f: + data = [json.loads(line) for line in f] + + processed_samples = [] + hash_set = set() + invalid_count = 0 
+ redacted_count = 0 + non_redacted_count = 0 + + for sample in data: + processed = process_sample(sample) + if processed: + if processed["redacted"]: + redacted_count += 1 + if include_redacted: + sample_hash = hash_sample(processed) + if sample_hash not in hash_set: + hash_set.add(sample_hash) + processed_samples.append(processed) + else: + print(f"Duplicate redacted sample found: {sample}") + else: + non_redacted_count += 1 + sample_hash = hash_sample(processed) + if sample_hash not in hash_set: + hash_set.add(sample_hash) + processed_samples.append(processed) + else: + print(f"Duplicate non-redacted sample found: {sample}") + else: + print(f"Invalid sample: {sample}") + invalid_count += 1 + + with open(output_file, "w", encoding="utf-8") as f: + for sample in processed_samples: + json.dump(sample, f, ensure_ascii=False) + f.write("\n") + + print(f"Processed {len(processed_samples)} unique samples.") + print(f"Found {invalid_count} invalid samples.") + print(f"Total samples: {len(data)}") + print(f"Unique non-redacted samples: {non_redacted_count}") + print(f"Redacted samples: {redacted_count}") + if include_redacted: + print("Redacted samples included in output") + else: + print("Redacted samples not included in output") + print(f"Output written to {output_file}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert JSON to specified JSONL format") + parser.add_argument("--input-file", required=True, help="Input JSON file path") + parser.add_argument("--output-file", required=True, help="Output JSONL file path") + parser.add_argument("--include-redacted", action="store_true", help="Include redacted samples in output") + args = parser.parse_args() + + main(args.input_file, args.output_file, args.include_redacted) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 50e2884..9169b52 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -1,7 +1,16 @@ #!/bin/bash -INPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja -OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-4 +set -e + +INCLUDE_REDACTED=true +FILTERD_SCORE=7 +NEXT_TOKEN_PERCENT=0.25 + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT + +if $INCLUDE_REDACTED; then + OUTPUT_DIR=$OUTPUT_DIR-redacted +fi mkdir -p $OUTPUT_DIR @@ -23,10 +32,41 @@ done python tools/dataset/fileter.py \ --input_file $MERGED_FILE \ --output_file $OUTPUT_DIR/train.jsonl \ - --threshold 7 + --threshold $FILTERD_SCORE rm $MERGED_FILE -echo "Done" +echo "Filtered open assistant data:" +wc -l $OUTPUT_DIR/train.jsonl + +if $INCLUDE_REDACTED; then + LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl +else + LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-no-redacted.jsonl +fi + +cat $LMSYS_FILE >> $OUTPUT_DIR/train.jsonl + +INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') +NEXT_TOKEN_SAMPLES=$(echo "$INSTRUCTION_SAMPLES * $NEXT_TOKEN_PERCENT / 1" | bc) +python tools/dataset/extract_jsonl.py \ + --input-path /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format/merged.jsonl \ + --output-path $OUTPUT_DIR/next-token.jsonl \ + --num-lines $NEXT_TOKEN_SAMPLES \ + --seed 1234 + +echo "Next token data:" +wc -l $OUTPUT_DIR/next-token.jsonl + +cat $OUTPUT_DIR/next-token.jsonl >> $OUTPUT_DIR/train.jsonl + +echo "Total data:" wc -l 
$OUTPUT_DIR/train.jsonl + +rm $OUTPUT_DIR/next-token.jsonl + +# indexing + +python tools/pre-process/index_dataset.py \ + --data-file-path $OUTPUT_DIR/train.jsonl diff --git a/tools/dataset/merge_next_token.sh b/tools/dataset/merge_next_token.sh new file mode 100644 index 0000000..4daff5e --- /dev/null +++ b/tools/dataset/merge_next_token.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format + +mkdir -p $OUTPUT_DIR + +FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k -name "*.jsonl") + +MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE From 721a1433de9099696c18112ac97b49bea6185568 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 01:14:35 +0900 Subject: [PATCH 14/44] feat: next token prediction style --- src/llama_recipes/utils/instruction_tuning.py | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 1cb73aa..a707368 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -64,26 +64,34 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: print(f"index={index}, offset={offset}, line={line}, error={e}") exit(1) - SYSTEM_PROMPT: list[dict[str, str]] = [ - { - "role": self.system_prompt_role, - "content": self.system_prompt_content, - } - ] - # chat template - prompt = self.tokenizer.apply_chat_template( - conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore - add_generation_prompt=True, - tokenize=True, - ) - - example = self.tokenizer.apply_chat_template( - conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore - {"role": "assistant", "content": conversations["output"]} - ], - tokenize=True, - ) - tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) + if 'role' in conversations and conversations["role"] == "next_token_prediction": + prompt = self.tokenizer.bos_token + example = self.tokenizer.encode( + conversations["content"], # type: ignore + add_special_tokens=False + ) + tensor_example = torch.tensor(example, dtype=torch.int64) + else: + SYSTEM_PROMPT: list[dict[str, str]] = [ + { + "role": self.system_prompt_role, + "content": self.system_prompt_content, + } + ] + # chat template + prompt = self.tokenizer.apply_chat_template( + conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore + add_generation_prompt=True, + tokenize=True, + ) + + example = self.tokenizer.apply_chat_template( + conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore + {"role": "assistant", "content": conversations["output"]} + ], + tokenize=True, + ) + tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) if len(example) > self.max_words: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") From 1a34e24456d27e7247ebc262a6baf2cf3cb5b763 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 11:59:11 +0900 Subject: [PATCH 15/44] feat: lmsys-chat-1m job script --- .../Llama-3.1-8B-instruct-exp2-1.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-2.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-3.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-4.sh | 122 ++++++++++++++++++ 4 files changed, 488 insertions(+) create mode 100644 
scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh new file mode 100644 index 0000000..2bfe779 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ 
+ --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh new file mode 100644 index 0000000..8c85d39 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ 
+ --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh new file mode 100644 index 0000000..f833e8d --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + 
--lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh new file mode 100644 index 0000000..bb48b38 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-6 +MIN_LR=1e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path 
${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From ecc287921978ea6fb07a8b64348b492fef1cbb55 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 15:25:06 +0900 Subject: [PATCH 16/44] feat: checkpoint convert script --- tools/checkpoint-convert/convert_ckpt.py | 21 ++++++--- .../scripts/abci/convert_ckpt_instruct.sh | 44 ++++++++++--------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/tools/checkpoint-convert/convert_ckpt.py b/tools/checkpoint-convert/convert_ckpt.py index e8dcb32..be48a4c 100644 --- a/tools/checkpoint-convert/convert_ckpt.py +++ b/tools/checkpoint-convert/convert_ckpt.py @@ -1,35 +1,42 @@ import argparse import torch -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( - "--model", type=str, required=True, help="HuggingFace transformers model name" + "--hf-base-model-checkpoint-path", type=str, + required=True, help="HuggingFace transformers model name" + ) + parser.add_argument("--hf-tokenizer-path", type=str, required=True) + parser.add_argument( + "--pytorch-model-checkpoint-path", type=str, + required=True, help="Path to checkpoint (`model.pth`)" ) - parser.add_argument("--ckpt", type=str, required=True, help="Path to checkpoint (`model.pth`)") parser.add_argument("--out", type=str, required=True, help="Path to output directory") parser.add_argument("--sequence-length", type=int, required=True) args = parser.parse_args() - print(f"Loading HF model: {args.model}", flush=True) + print(f"Loading HF model: {args.hf_base_model_checkpoint_path}", flush=True) model = AutoModelForCausalLM.from_pretrained( - args.model, + args.hf_base_model_checkpoint_path, torch_dtype=torch.bfloat16, trust_remote_code=True, max_position_embeddings=args.sequence_length, ) + tokenizer = AutoTokenizer.from_pretrained(args.hf_tokenizer_path) - print(f"Loading CKPT: {args.ckpt}", flush=True) - state_dict = torch.load(args.ckpt, map_location="cpu") + print(f"Loading CKPT: {args.pytorch_model_checkpoint_path}", flush=True) + state_dict = torch.load(args.pytorch_model_checkpoint_path, map_location="cpu") print("Loading state dict into HF model", flush=True) model.load_state_dict(state_dict) print("Saving HF model", flush=True) model.save_pretrained(args.out, safe_serialization=True) + tokenizer.save_pretrained(args.out) if __name__ == "__main__": diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index e54860b..7f601d4 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -20,32 +20,34 @@ set -e # swich virtual env 
source .env/bin/activate -# distributed settings -export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) -export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_1e-5_MINLR_1e-6_WD_0.1_GC_1 +LATEST_ITERATION=$(cat ${CHECKPOINT_DIR}/latest_iteration.txt) -echo "MASTER_ADDR=${MASTER_ADDR}" +echo "LATEST_ITERATION=${LATEST_ITERATION}" -start=578 -end=578 -increment=5000 +BASE_MODEL_CHECKPOINT=/bb/llm/gaf51275/hf-checkpoints/Meta-Llama-3.1-8B-Instruct +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +OUTPUT_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/ +EXTRACTED_PATH=$(echo $CHECKPOINT_DIR | awk -F'/Llama-3.1-8B-Instruct/' '{print $2}') +OUTPUT_DIR="${OUTPUT_DIR}${EXTRACTED_PATH}" -for ((i = start; i <= end; i += increment)); do - ITERATION=$i - FORMATTED_ITERATION=$(printf "iter_%07d" $ITERATION) +echo "convert ${CHECKPOINT_DIR} to ${OUTPUT_DIR}" +mkdir -p $OUTPUT_DIR - CHECK_POINT_PATH=/bb/llm/gaf51275/llama/checkpoints/Swallow-70b-VE-chat/oasst2-top1-imitation-2-3-lr_1e-5-minlr_1e-6-GB_256/${FORMATTED_ITERATION}/model.pt - OUTPUT_PATH=/bb/llm/gaf51275/llama/converted-hf-checkpoint/Swallow-70b-VE-chat/oasst2-top1-imitation-2-3-lr_1e-5-minlr_1e-6-GB_256/${FORMATTED_ITERATION} +ITERATION=$LATEST_ITERATION +FORMATTED_ITERATION=$(printf "iter_%07d" $ITERATION) - echo "convert ${CHECK_POINT_PATH} to ${OUTPUT_PATH}" +CHECK_POINT_PATH=${CHECKPOINT_DIR}/${FORMATTED_ITERATION}/model.pt +OUTPUT_PATH=${OUTPUT_DIR}/${FORMATTED_ITERATION} - mkdir -p $OUTPUT_PATH +echo "convert ${CHECK_POINT_PATH} to ${OUTPUT_PATH}" - BASE_MODEL_CHECKPOINT=/bb/llm/gaf51275/llama/huggingface-checkpoint/Swallow-70b-hf +mkdir -p $OUTPUT_PATH - python tools/checkpoint-convert/convert_ckpt.py \ - --model $BASE_MODEL_CHECKPOINT \ - --ckpt $CHECK_POINT_PATH \ - --out $OUTPUT_PATH \ - --sequence-length 4096 -done +# convert +python tools/checkpoint-convert/convert_ckpt.py \ + --hf-base-model-checkpoint-path $BASE_MODEL_CHECKPOINT \ + --hf-tokenizer-path $TOKENIZER_DIR \ + --pytorch-model-checkpoint-path $CHECK_POINT_PATH \ + --out $OUTPUT_PATH \ + --sequence-length 8192 From 652faff5317129294a009b242c2b4009d4479121 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 16:17:02 +0900 Subject: [PATCH 17/44] chore: update dataset merge script --- tools/dataset/merge_dataset.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 9169b52..2715f64 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -3,8 +3,8 @@ set -e INCLUDE_REDACTED=true -FILTERD_SCORE=7 -NEXT_TOKEN_PERCENT=0.25 +FILTERD_SCORE=4 +NEXT_TOKEN_PERCENT=0.5 OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -48,7 +48,7 @@ fi cat $LMSYS_FILE >> $OUTPUT_DIR/train.jsonl INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') -NEXT_TOKEN_SAMPLES=$(echo "$INSTRUCTION_SAMPLES * $NEXT_TOKEN_PERCENT / 1" | bc) +NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) python tools/dataset/extract_jsonl.py \ --input-path /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format/merged.jsonl \ From 11d69183d1fbe581100817406667a22faabe3df5 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 9 Sep 2024 16:17:51 +0900 
Subject: [PATCH 18/44] feat: instruct model upload script --- .../scripts/abci/convert_ckpt_instruct.sh | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index 7f601d4..25e532c 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -16,6 +16,7 @@ module load hpcx/2.12 module load gcc/11.4.0 set -e +export HF_HOME="/groups/gag51395/.cache/huggigface" # swich virtual env source .env/bin/activate @@ -51,3 +52,36 @@ python tools/checkpoint-convert/convert_ckpt.py \ --pytorch-model-checkpoint-path $CHECK_POINT_PATH \ --out $OUTPUT_PATH \ --sequence-length 8192 + +# upload +upload_checkpoint() { + local upload_dir=$1 + local repo_name=$2 + local max_retries=5 + local retry_count=0 + + while [ $retry_count -lt $max_retries ]; do + if python scripts/abci/upload/upload.py \ + --ckpt-path "$upload_dir" \ + --repo-name "$repo_name"; then + echo "Successfully uploaded $repo_name" + return 0 + else + echo "Upload failed for $repo_name. Retrying..." + ((retry_count++)) + sleep 5 + fi + done + + echo "Failed to upload $repo_name after $max_retries attempts" + return 1 +} + +EXP_NAME=$(echo $EXTRACTED_PATH | sed 's/\//-/g') +HF_REPO_NAME="tokyotech-llm/Llama-3.1-8B-Instruct-${EXP_NAME}-${FORMATTED_ITERATION}" + +echo "upload ${OUTPUT_PATH} to ${HF_REPO_NAME}" + +if ! upload_checkpoint "$OUTPUT_PATH" "$HF_REPO_NAME"; then + echo "Skipping to next checkpoint after repeated failures for $HF_REPO_NAME" +fi From 09e48db75f78a0ca4395cbd114fb37f1be83138b Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 10 Sep 2024 13:38:11 +0900 Subject: [PATCH 19/44] feat: dataset merge & check --- tools/dataset/convert_conversation.py | 40 ++++++++++++++++++++++++++ tools/dataset/merge_dataset.sh | 6 ++-- tools/dataset/merge_next_token.sh | 2 +- tools/dataset/next_token_prediciton.py | 24 ++++++++++++++++ 4 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 tools/dataset/convert_conversation.py create mode 100644 tools/dataset/next_token_prediciton.py diff --git a/tools/dataset/convert_conversation.py b/tools/dataset/convert_conversation.py new file mode 100644 index 0000000..5ca72f8 --- /dev/null +++ b/tools/dataset/convert_conversation.py @@ -0,0 +1,40 @@ +import argparse +import json +import sys + + +def process_jsonl(input_file, output_file): + with open(input_file, "r") as infile, open(output_file, "w") as outfile: + for line in infile: + data = json.loads(line) + + conversations = data.get("conversations", []) + assert len(conversations) >= 2 + + input_data = conversations[:-1] + output_data = conversations[-1] + + data["input"] = input_data + data["output"] = output_data + + json.dump(data, outfile) + outfile.write("\n") + + +def main(): + parser = argparse.ArgumentParser(description="Process JSONL data") + parser.add_argument("--input", help="Input JSONL file") + parser.add_argument("--output", help="Output JSONL file") + + args = parser.parse_args() + + try: + process_jsonl(args.input, args.output) + print(f"Processing complete. 
Output written to {args.output}") + except Exception as e: + print(f"An error occurred: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 2715f64..dbabc69 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -3,8 +3,8 @@ set -e INCLUDE_REDACTED=true -FILTERD_SCORE=4 -NEXT_TOKEN_PERCENT=0.5 +FILTERD_SCORE=7 +NEXT_TOKEN_PERCENT=0.25 OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -19,7 +19,7 @@ FILES=( "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/lm_scored.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" ) MERGED_FILE=$OUTPUT_DIR/merged.jsonl diff --git a/tools/dataset/merge_next_token.sh b/tools/dataset/merge_next_token.sh index 4daff5e..72c43e3 100644 --- a/tools/dataset/merge_next_token.sh +++ b/tools/dataset/merge_next_token.sh @@ -4,7 +4,7 @@ OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-predicti mkdir -p $OUTPUT_DIR -FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k -name "*.jsonl") +FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/next-token/next-token-prediction_500k/format -name "*.jsonl") MERGED_FILE=$OUTPUT_DIR/merged.jsonl diff --git a/tools/dataset/next_token_prediciton.py b/tools/dataset/next_token_prediciton.py new file mode 100644 index 0000000..903ce63 --- /dev/null +++ b/tools/dataset/next_token_prediciton.py @@ -0,0 +1,24 @@ +import argparse +import json + +def check_jsonl_file(file_path): + with open(file_path, 'r') as file: + for line_number, line in enumerate(file, 1): + try: + json_obj = json.loads(line) + if json_obj.get('role') == 'next_token_prediction': + pass + else: + print(f"Line {line_number}: 'role': 'next_token_prediction' not found") + except json.JSONDecodeError: + print(f"Line {line_number}: Invalid JSON") + +def main(): + parser = argparse.ArgumentParser(description="Check JSONL file for 'role': 'next_token_prediction'") + parser.add_argument('--file_path', help='Path to the JSONL file') + args = parser.parse_args() + + check_jsonl_file(args.file_path) + +if __name__ == '__main__': + main() From 8e352515fa4ac848292224675dfe837d8919efe2 Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 10 Sep 2024 13:38:49 +0900 Subject: [PATCH 20/44] feat: Llama-3.1-8B Instruct exp2 --- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh | 2 +- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh | 2 +- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh | 2 +- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh index 2bfe779..1751716 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS 
-MICRO_BATCH_SIZE=4
+MICRO_BATCH_SIZE=2
 GLOBAL_BATCH_SIZE=256

 # optimizer config
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
index 8c85d39..3aa6216 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
@@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"
 SEQ_LENGTH=8192
 DATA_PARALLEL_SIZE=$NUM_GPUS

-MICRO_BATCH_SIZE=4
+MICRO_BATCH_SIZE=2
 GLOBAL_BATCH_SIZE=256

 # optimizer config
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
index f833e8d..e5afab6 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
@@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"
 SEQ_LENGTH=8192
 DATA_PARALLEL_SIZE=$NUM_GPUS

-MICRO_BATCH_SIZE=4
+MICRO_BATCH_SIZE=2
 GLOBAL_BATCH_SIZE=256

 # optimizer config
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh
index bb48b38..fe513da 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh
@@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"
 SEQ_LENGTH=8192
 DATA_PARALLEL_SIZE=$NUM_GPUS

-MICRO_BATCH_SIZE=4
+MICRO_BATCH_SIZE=2
 GLOBAL_BATCH_SIZE=256

 # optimizer config
From 8ee006a6d7dc27b67b407834731e13c32cef292d Mon Sep 17 00:00:00 2001
From: kazuki
Date: Tue, 10 Sep 2024 16:26:42 +0900
Subject: [PATCH 21/44] feat: change eval interval

---
 .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh | 2 +-
 .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh | 2 +-
 .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh | 2 +-
 .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh | 2 +-
 src/llama_recipes/utils/instruction_tuning.py | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh
index 1751716..f674a55 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh
@@ -103,7 +103,7 @@ mpirun -np $NUM_GPUS \
 --adam-beta2 0.95 \
 --adam-eps 1e-8 \
 --save-interval 500 \
- --eval-interval 500 \
+ --eval-interval 500000 \
 --eval-iters 10 \
 --bf16 \
 --mixed-precision \
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
index 3aa6216..edcc309 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
+++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh
@@ -103,7 +103,7 @@ mpirun -np $NUM_GPUS \
 --adam-beta2 0.95 \
 --adam-eps 1e-8 \
 --save-interval 500 \
- --eval-interval 500 \
+ --eval-interval 500000 \
 --eval-iters 10 \
 --bf16 \
 --mixed-precision \
diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
index e5afab6..2fe0feb 100644
--- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh
+++ 
b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh @@ -103,7 +103,7 @@ mpirun -np $NUM_GPUS \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ --save-interval 500 \ - --eval-interval 500 \ + --eval-interval 500000 \ --eval-iters 10 \ --bf16 \ --mixed-precision \ diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh index fe513da..dd41a05 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh @@ -103,7 +103,7 @@ mpirun -np $NUM_GPUS \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ --save-interval 500 \ - --eval-interval 500 \ + --eval-interval 500000 \ --eval-iters 10 \ --bf16 \ --mixed-precision \ diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index a707368..8d00fd6 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -22,7 +22,7 @@ def __init__( args = get_args() self.data_path: str = data_path - self.max_words: int = args.seq_length + self.max_tokens: int = args.seq_length self.tokenizer = tokenizer # system prompt @@ -93,10 +93,10 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: ) tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) - if len(example) > self.max_words: + if len(example) > self.max_tokens: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") - padding_length: int = self.max_words - len(example) + padding_length: int = self.max_tokens - len(example) eos_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] pad_token_id = eos_token_id if padding_length > 0: @@ -105,7 +105,7 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: ) tensor_example = torch.cat((tensor_example, pad_tensor)) elif padding_length < 0: - tensor_example = tensor_example[: self.max_words] + tensor_example = tensor_example[: self.max_tokens] labels = copy.deepcopy(tensor_example) # promptの長さ分だけ -1 で埋める -> 損失関数で無視するようになる From 1eecce4266d3eda152b34fb4f272105d2cc90ab5 Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 10 Sep 2024 23:04:54 +0900 Subject: [PATCH 22/44] fix: instruction with next-token prediction --- src/llama_recipes/utils/instruction_tuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 8d00fd6..48f91f2 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -68,7 +68,7 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: prompt = self.tokenizer.bos_token example = self.tokenizer.encode( conversations["content"], # type: ignore - add_special_tokens=False + add_special_tokens=True # text + (=) ) tensor_example = torch.tensor(example, dtype=torch.int64) else: From 8080e0742fe40feed4e96b4f7ea6ad49cc6eeb62 Mon Sep 17 00:00:00 2001 From: kazuki Date: Tue, 10 Sep 2024 23:22:17 +0900 Subject: [PATCH 23/44] chore: tools script (move directory) --- tools/dataset/{ => converter}/convert_conversation.py | 0 tools/dataset/{ => converter}/convert_dataset_dpo.py | 0 tools/dataset/{ => converter}/convert_dataset_instruct.py | 0 tools/dataset/{ => debug}/debug_chat_template.py | 2 +- tools/dataset/{ => debug}/debug_instruction.py | 0 5 files changed, 1 insertion(+), 1 deletion(-) rename tools/dataset/{ => 
converter}/convert_conversation.py (100%) rename tools/dataset/{ => converter}/convert_dataset_dpo.py (100%) rename tools/dataset/{ => converter}/convert_dataset_instruct.py (100%) rename tools/dataset/{ => debug}/debug_chat_template.py (98%) rename tools/dataset/{ => debug}/debug_instruction.py (100%) diff --git a/tools/dataset/convert_conversation.py b/tools/dataset/converter/convert_conversation.py similarity index 100% rename from tools/dataset/convert_conversation.py rename to tools/dataset/converter/convert_conversation.py diff --git a/tools/dataset/convert_dataset_dpo.py b/tools/dataset/converter/convert_dataset_dpo.py similarity index 100% rename from tools/dataset/convert_dataset_dpo.py rename to tools/dataset/converter/convert_dataset_dpo.py diff --git a/tools/dataset/convert_dataset_instruct.py b/tools/dataset/converter/convert_dataset_instruct.py similarity index 100% rename from tools/dataset/convert_dataset_instruct.py rename to tools/dataset/converter/convert_dataset_instruct.py diff --git a/tools/dataset/debug_chat_template.py b/tools/dataset/debug/debug_chat_template.py similarity index 98% rename from tools/dataset/debug_chat_template.py rename to tools/dataset/debug/debug_chat_template.py index 025b71e..20a79d1 100644 --- a/tools/dataset/debug_chat_template.py +++ b/tools/dataset/debug/debug_chat_template.py @@ -45,7 +45,7 @@ } ] -chat_template: str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +chat_template: str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" # noqa: print("before apply chat template") diff --git a/tools/dataset/debug_instruction.py b/tools/dataset/debug/debug_instruction.py similarity index 100% rename from tools/dataset/debug_instruction.py rename to tools/dataset/debug/debug_instruction.py From c05d127d475d17ed4a98fbdb580ff9b0aafb75a9 Mon Sep 17 00:00:00 2001 From: kazuki Date: Wed, 11 Sep 2024 01:34:52 +0900 Subject: [PATCH 24/44] feat: dataset --- tools/dataset/filter/filter_gemma_magpie.py | 109 ++++++++++++++++++++ tools/dataset/lmsys_dataset.py | 1 + tools/dataset/merge_gemma_magpie.sh | 16 +++ 3 files changed, 126 insertions(+) create mode 100644 tools/dataset/filter/filter_gemma_magpie.py create mode 100644 tools/dataset/merge_gemma_magpie.sh diff --git a/tools/dataset/filter/filter_gemma_magpie.py b/tools/dataset/filter/filter_gemma_magpie.py new file mode 100644 index 0000000..9a48d99 --- /dev/null +++ b/tools/dataset/filter/filter_gemma_magpie.py @@ -0,0 +1,109 @@ +import argparse +import json +import sys +import random +import re + + +def is_empty_or_template(content): + content = content.strip() + return content in ("", "\n", "\n\n") or content in ("回答例:", "回答例;", "解答例:", "解答例;") + + +def clean_content_start(content): + # Remove leading ">\n\n" or ">\n\n\n" + content = re.sub(r"^>\n\n+", "", content) + # Remove leading asterisks + content = 
re.sub(r"^\s*\*+\s*", "", content) + return content + + +def clean_content_end(content): + # Remove leading newlines and spaces + content = content.lstrip("\n ") + + # Process the end of the content + lines = content.splitlines() + if lines: + # Clean the last line + last_line = lines[-1].rstrip() + # Remove trailing "**" if present + last_line = re.sub(r"\*+\s*$", "", last_line) + lines[-1] = last_line + + # Join the lines back together + content = "\n".join(lines) + + # Remove trailing asterisks followed by newline + content = re.sub(r"\*+\s*\n$", "\n", content) + + # Ensure the content ends with exactly one newline + content = content.rstrip() + "\n" + + return content + + +def process_jsonl(input_file, output_file): + processed_data = [] + seen_contents = set() + with open(input_file, "r") as infile: + for line in infile: + try: + data = json.loads(line) + + # Transform input + if "input" in data: + data["input"] = [data["input"]] + + # Clean input and output content + input_content = clean_content_end(clean_content_start(data["input"][0].get("content", ""))) + output_content = clean_content_end(clean_content_start(data.get("output", {}).get("content", ""))) + + # Check for empty or template content + if is_empty_or_template(input_content) or is_empty_or_template(output_content): + continue + + # Check for duplicates + content_pair = (input_content, output_content) + if content_pair in seen_contents: + continue + seen_contents.add(content_pair) + + # Update cleaned contents + data["input"][0]["content"] = input_content + data["output"]["content"] = output_content + + # add text section + data["text"] = "user: " + input_content + "\n" + "assistant: " + output_content + + processed_data.append(data) + + except json.JSONDecodeError: + print(f"Error decoding JSON: {line}", file=sys.stderr) + + # Shuffle the processed data + random.shuffle(processed_data) + + # Write the shuffled data to the output file + with open(output_file, "w") as outfile: + for data in processed_data: + json.dump(data, outfile) + outfile.write("\n") + + +def main(): + parser = argparse.ArgumentParser(description="Process and shuffle JSONL files") + parser.add_argument("--input", help="Input JSONL file") + parser.add_argument("--output", help="Output JSONL file") + parser.add_argument("--seed", type=int, help="Random seed for shuffling", default=123) + + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + + process_jsonl(args.input, args.output) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/lmsys_dataset.py b/tools/dataset/lmsys_dataset.py index 4d2d725..7c7cb27 100644 --- a/tools/dataset/lmsys_dataset.py +++ b/tools/dataset/lmsys_dataset.py @@ -20,6 +20,7 @@ def process_sample(sample: dict[str, Any]) -> dict[str, Any] | None: "output": {"role": "assistant", "content": assistant_message["content"]}, "conversation": sample, "redacted": "NAME_" in user_message["content"] or "NAME_" in assistant_message["content"], + "text": "user: " + user_message["content"] + "\n\nassistant: " + assistant_message["content"] } return result diff --git a/tools/dataset/merge_gemma_magpie.sh b/tools/dataset/merge_gemma_magpie.sh new file mode 100644 index 0000000..afd6d5f --- /dev/null +++ b/tools/dataset/merge_gemma_magpie.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it + +mkdir -p $OUTPUT_DIR +rm $OUTPUT_DIR/merged.jsonl + +FILES=$(find /bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it -name "*.jsonl") + 
+MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE From 8da4e779d1c177c1d729ad5317e278238dce50ee Mon Sep 17 00:00:00 2001 From: kazuki Date: Wed, 11 Sep 2024 21:56:29 +0900 Subject: [PATCH 25/44] feat: instruct tuning scripts --- .../Llama-3.1-8B-instruct-exp2-5.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-6.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-7.sh | 122 ++++++++++++++++++ 3 files changed, 366 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh new file mode 100644 index 0000000..bd16373 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=2 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-5/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-5-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size 
${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh new file mode 100644 index 0000000..b7840d0 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=2 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-6/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.5-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-6-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python 
examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh new file mode 100644 index 0000000..508fd54 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=2 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-6 +MIN_LR=5e-7 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-7/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25 + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-7-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x 
MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From eea3bf16b524cc025df81b81843d4cc81d8c2d1c Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 02:34:54 +0900 Subject: [PATCH 26/44] fix: instruction tuning --- src/llama_recipes/utils/instruction_tuning.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 48f91f2..062783b 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -64,12 +64,15 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: print(f"index={index}, offset={offset}, line={line}, error={e}") exit(1) + eod_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] + if 'role' in conversations and conversations["role"] == "next_token_prediction": - prompt = self.tokenizer.bos_token + prompt = [self.tokenizer.bos_token_id] example = self.tokenizer.encode( conversations["content"], # type: ignore - add_special_tokens=True # text + (=) + add_special_tokens=True # text + ) + example += [eod_token_id] tensor_example = torch.tensor(example, dtype=torch.int64) else: SYSTEM_PROMPT: list[dict[str, str]] = [ @@ -81,24 +84,25 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: # chat template prompt = self.tokenizer.apply_chat_template( conversation=SYSTEM_PROMPT + conversations["input"], # type: ignore - add_generation_prompt=True, tokenize=True, ) example = self.tokenizer.apply_chat_template( conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore - {"role": "assistant", "content": conversations["output"]} + conversations["output"] ], tokenize=True, ) tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) + # print(f"prompt: {self.tokenizer.decode(prompt, skip_special_tokens=False)}\n\nexample: {self.tokenizer.decode(example, skip_special_tokens=False)}", flush=True) + if len(example) > self.max_tokens: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") padding_length: int = self.max_tokens - len(example) - eos_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] - pad_token_id = eos_token_id + pad_token_id: int = self.tokenizer.pad_token_id # type: ignore + assert pad_token_id is not None if padding_length > 0: pad_tensor = torch.full( (padding_length,), pad_token_id, 
dtype=torch.int64 From 5ab7b56a71a725ad1ffcfacad6410a1dd92fea7b Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 02:50:21 +0900 Subject: [PATCH 27/44] feat: instruction tuning script --- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh | 4 ++-- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh | 4 ++-- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh | 6 +++--- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh | 2 +- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh | 2 +- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh index 2db9f5c..1d6424d 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-1.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_AF=1 +#$ -l rt_AF=8 #$ -l h_rt=1:00:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ @@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --save-interval 500 \ + --save-interval 50000 \ --eval-interval 500 \ --eval-iters 10 \ --bf16 \ diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh index 612b1ac..4b7df07 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-2.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_AF=1 +#$ -l rt_AF=8 #$ -l h_rt=1:00:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ @@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --save-interval 500 \ + --save-interval 50000 \ --eval-interval 500 \ --eval-iters 10 \ --bf16 \ diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh index 127120f..6e47a3b 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp1-4.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_AF=1 +#$ -l rt_AF=8 #$ -l h_rt=1:00:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=2e-6 -MIN_LR=2e-7 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh index f674a55..d4f6e7d 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-1.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh index edcc309..16eb573 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh +++ 
b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-2.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh index 2fe0feb..3306610 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-3.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh index dd41a05..d473880 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-4.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh index bd16373..a6bab02 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh index b7840d0..5add4f8 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh index 508fd54..226e89a 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh @@ -50,7 +50,7 @@ done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" SEQ_LENGTH=8192 DATA_PARALLEL_SIZE=$NUM_GPUS -MICRO_BATCH_SIZE=2 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config From e48e11655399673d6780c946c7987fdfbbba0d12 Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 02:51:44 +0900 Subject: [PATCH 28/44] feat: instruction script --- tools/inference/inference.py | 23 ++++++++++++++------ tools/inference/inference_abci.sh | 35 +++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 tools/inference/inference_abci.sh diff --git a/tools/inference/inference.py b/tools/inference/inference.py index e0a1a58..2991255 100644 --- a/tools/inference/inference.py +++ b/tools/inference/inference.py @@ -9,6 +9,7 @@ parser.add_argument("--model-path", type=str) 
parser.add_argument("--tokenizer-path", type=str) parser.add_argument("--prompt", type=str, default=None) +parser.add_argument("--chat-template", action="store_true") args = parser.parse_args() @@ -22,11 +23,21 @@ device_map="auto", torch_dtype=torch.bfloat16 ) -input_ids: torch.Tensor = tokenizer.encode( # type: ignore - args.prompt, - add_special_tokens=False, - return_tensors="pt" -) +if args.chat_template: + input_ids = tokenizer.apply_chat_template( # type: ignore + [ + {"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。"}, + {"role": "user", "content": args.prompt}, + ], + tokenize=True, + return_tensors="pt" + ) +else: + input_ids: torch.Tensor = tokenizer.encode( # type: ignore + args.prompt, + add_special_tokens=False, + return_tensors="pt" + ) outputs = model.generate( # type: ignore input_ids.to(device=model.device), # type: ignore max_new_tokens=1024, @@ -35,5 +46,5 @@ do_sample=True, ) -generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) +generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False) print(generated_text) diff --git a/tools/inference/inference_abci.sh b/tools/inference/inference_abci.sh new file mode 100644 index 0000000..e9fbab9 --- /dev/null +++ b/tools/inference/inference_abci.sh @@ -0,0 +1,35 @@ +#!/bin/bash +#$ -l rt_AF=1 +#$ -l h_rt=0:01:00:00 +#$ -j y +#$ -o outputs/inference/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +set -e + +# swich virtual env +source .env/bin/activate + +INFERENCE_MODEL_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/exp2-1/LR_2.5e-5_MINLR_2.5e-6_WD_0.1_GC_1/iter_0005078 + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "Please explain Credit Default Swaps." 
\ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "会社法について説明してください。" \ + --chat-template From d3e5c233d00014254506f377301bd95255f89130 Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 03:29:25 +0900 Subject: [PATCH 29/44] feat: instruction assertion --- src/llama_recipes/arguments.py | 1 + src/llama_recipes/utils/instruction_tuning.py | 50 +++++++++++++++---- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src/llama_recipes/arguments.py b/src/llama_recipes/arguments.py index a2d2c48..e3c38fc 100644 --- a/src/llama_recipes/arguments.py +++ b/src/llama_recipes/arguments.py @@ -339,6 +339,7 @@ def _add_instruction_tuning_args(parser: argparse.ArgumentParser) -> argparse.Ar group.add_argument( "--save-sampler-state", action="store_true", ) + group.add_argument("--instruct-debug", action="store_true") return parser diff --git a/src/llama_recipes/utils/instruction_tuning.py b/src/llama_recipes/utils/instruction_tuning.py index 062783b..446e384 100644 --- a/src/llama_recipes/utils/instruction_tuning.py +++ b/src/llama_recipes/utils/instruction_tuning.py @@ -24,6 +24,7 @@ def __init__( self.data_path: str = data_path self.max_tokens: int = args.seq_length self.tokenizer = tokenizer + self.debug_mode = args.instruct_debug # system prompt self.system_prompt_role = args.system_prompt_role @@ -66,11 +67,10 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: eod_token_id: int = self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0] - if 'role' in conversations and conversations["role"] == "next_token_prediction": + if "role" in conversations and conversations["role"] == "next_token_prediction": prompt = [self.tokenizer.bos_token_id] example = self.tokenizer.encode( - conversations["content"], # type: ignore - add_special_tokens=True # text + + conversations["content"], add_special_tokens=True # type: ignore # text + ) example += [eod_token_id] tensor_example = torch.tensor(example, dtype=torch.int64) @@ -88,14 +88,16 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: ) example = self.tokenizer.apply_chat_template( - conversation=SYSTEM_PROMPT + conversations["input"] + [ # type: ignore - conversations["output"] - ], + conversation=SYSTEM_PROMPT + conversations["input"] + [conversations["output"]], # type: ignore tokenize=True, ) tensor_example: torch.Tensor = torch.tensor(example, dtype=torch.int64) - # print(f"prompt: {self.tokenizer.decode(prompt, skip_special_tokens=False)}\n\nexample: {self.tokenizer.decode(example, skip_special_tokens=False)}", flush=True) + if self.debug_mode: + print( + f"prompt: {self.tokenizer.decode(prompt, skip_special_tokens=False)}\n\nexample: {self.tokenizer.decode(example, skip_special_tokens=False)}\n\n", + flush=True, + ) if len(example) > self.max_tokens: print(f"\n\nWARNING: example={self.tokenizer.decode(example)}\n\n") @@ -104,9 +106,7 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: pad_token_id: int = self.tokenizer.pad_token_id # type: ignore assert pad_token_id is not None if padding_length > 0: - pad_tensor = torch.full( - (padding_length,), pad_token_id, dtype=torch.int64 - ) + pad_tensor = torch.full((padding_length,), pad_token_id, dtype=torch.int64) tensor_example = torch.cat((tensor_example, pad_tensor)) elif padding_length < 0: tensor_example = tensor_example[: self.max_tokens] @@ -126,6 +126,36 @@ def __getitem__(self, index: int) -> dict[str, torch.Tensor]: # mask out 
pad token attention_mask = (tensor_example != pad_token_id).float() + # assert + if self.debug_mode: + # padding + pad_ignore_count = torch.sum((tensor_example == pad_token_id) & (labels == IGNORE_INDEX)).item() + assert ( + pad_ignore_count == padding_length + ), f"Number of IGNORE_INDEX due to padding ({pad_ignore_count}) does not match padding_length ({padding_length})" + + # prompt + non_pad_ignore_count = torch.sum( + (tensor_example != pad_token_id) & (labels == IGNORE_INDEX)).item() + assert non_pad_ignore_count == len( + prompt + ), f"Number of IGNORE_INDEX not due to padding ({non_pad_ignore_count}) does not match prompt length ({len(prompt)})" + + # labels' non ignore index + if "output" in conversations: + non_ignore_labels = labels[labels != IGNORE_INDEX] + + chat_template = [conversations["output"]] + expected_tokens = self.tokenizer.apply_chat_template( + chat_template, return_tensors="pt", tokenize=True # type: ignore + ).squeeze() # type: ignore + if expected_tokens[0] == self.tokenizer.bos_token_id: + expected_tokens = expected_tokens[1:] + + assert torch.all( + non_ignore_labels == expected_tokens + ), "Non-ignored labels do not match the tokenized last assistant message" + return { "input_ids": tensor_example, "labels": labels, From 1f536275d045c542a3d764657e4d05ccaa3ba87e Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 21:17:53 +0900 Subject: [PATCH 30/44] feat: checkpoint convert & inference script --- .../scripts/abci/convert_ckpt_instruct.sh | 4 ++-- tools/inference/inference_abci.sh | 20 ++++++++++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh index 25e532c..586dc12 100644 --- a/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh +++ b/tools/checkpoint-convert/scripts/abci/convert_ckpt_instruct.sh @@ -1,5 +1,5 @@ #!/bin/bash -#$ -l rt_F=1 +#$ -l rt_AF=1 #$ -l h_rt=1:00:00 #$ -j y #$ -o outputs/convert/ckpt/ @@ -21,7 +21,7 @@ export HF_HOME="/groups/gag51395/.cache/huggigface" # swich virtual env source .env/bin/activate -CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_1e-5_MINLR_1e-6_WD_0.1_GC_1 +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-1/LR_2.5e-5_MINLR_2.5e-6_WD_0.1_GC_1 LATEST_ITERATION=$(cat ${CHECKPOINT_DIR}/latest_iteration.txt) echo "LATEST_ITERATION=${LATEST_ITERATION}" diff --git a/tools/inference/inference_abci.sh b/tools/inference/inference_abci.sh index e9fbab9..3544d80 100644 --- a/tools/inference/inference_abci.sh +++ b/tools/inference/inference_abci.sh @@ -20,7 +20,7 @@ set -e # swich virtual env source .env/bin/activate -INFERENCE_MODEL_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/exp2-1/LR_2.5e-5_MINLR_2.5e-6_WD_0.1_GC_1/iter_0005078 +INFERENCE_MODEL_DIR=/bb/llm/gaf51275/2024/checkpoints/pytorch-to-hf/Llama-3.1-8B-Instruct/exp2-4/LR_1e-6_MINLR_1e-7_WD_0.1_GC_1/iter_0004000 python tools/inference/inference.py \ --model-path $INFERENCE_MODEL_DIR \ @@ -33,3 +33,21 @@ python tools/inference/inference.py \ --tokenizer-path $INFERENCE_MODEL_DIR \ --prompt "会社法について説明してください。" \ --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "東京工業大学のキャンパスはどこにありますか?" 
\ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "1+4+8の答えはいくつでしょうか?" \ + --chat-template + +python tools/inference/inference.py \ + --model-path $INFERENCE_MODEL_DIR \ + --tokenizer-path $INFERENCE_MODEL_DIR \ + --prompt "Pythonでデータ構造のUnionFindクラスを作成してください。" \ + --chat-template From 688afb437863deda9351a0e39b92ca0b1ddf137b Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 21:21:06 +0900 Subject: [PATCH 31/44] feat: dataset merge script --- tools/dataset/merge_dataset.sh | 53 ++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index dbabc69..646d1e9 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -2,9 +2,10 @@ set -e -INCLUDE_REDACTED=true +INCLUDE_REDACTED=false FILTERD_SCORE=7 -NEXT_TOKEN_PERCENT=0.25 +NEXT_TOKEN_PERCENT=0.5 +USE_OPEN_ASSISTANT=false OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -12,32 +13,42 @@ if $INCLUDE_REDACTED; then OUTPUT_DIR=$OUTPUT_DIR-redacted fi +if ! $USE_OPEN_ASSISTANT; then + OUTPUT_DIR=$OUTPUT_DIR-no-oasst +fi + mkdir -p $OUTPUT_DIR -FILES=( - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" -) +if $USE_OPEN_ASSISTANT; then + FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" + ) -MERGED_FILE=$OUTPUT_DIR/merged.jsonl + MERGED_FILE=$OUTPUT_DIR/merged.jsonl -for FILE in "${FILES[@]}"; do - cat $FILE >> $MERGED_FILE -done + for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE + done -# fileter -python tools/dataset/fileter.py \ - --input_file $MERGED_FILE \ - --output_file $OUTPUT_DIR/train.jsonl \ - --threshold $FILTERD_SCORE + # filter + python tools/dataset/fileter.py \ + --input_file $MERGED_FILE \ + --output_file $OUTPUT_DIR/train.jsonl \ + --threshold $FILTERD_SCORE -rm $MERGED_FILE + rm $MERGED_FILE -echo "Filtered open assistant data:" -wc -l $OUTPUT_DIR/train.jsonl + echo "Filtered open assistant data:" + wc -l $OUTPUT_DIR/train.jsonl +else + # Open Assistant データを使用しない場合は空のファイルを作成 + touch $OUTPUT_DIR/train.jsonl + echo "Skipped Open Assistant data processing." 
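# The "touch $OUTPUT_DIR/train.jsonl" fallback above creates an empty train.jsonl
# when the Open Assistant sources are skipped, presumably so that the later
# "cat ... >> $OUTPUT_DIR/train.jsonl" appends and the final "wc -l" sample count
# behave the same in both branches of the USE_OPEN_ASSISTANT switch.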
+fi if $INCLUDE_REDACTED; then LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl From 9998ca3aefb349cadd74143e5db2f586232a4399 Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 21:21:36 +0900 Subject: [PATCH 32/44] feat: change LR 5E-6 -> 1E-5 --- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh | 4 ++-- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh | 4 ++-- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh index a6bab02..b7498c2 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=5e-6 -MIN_LR=5e-7 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh index 5add4f8..8dd8d21 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=5e-6 -MIN_LR=5e-7 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh index 226e89a..dedc27f 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-7.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=5e-6 -MIN_LR=5e-7 +LR=1e-5 +MIN_LR=1e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 From ff6f3647d0754b363efa71995d7b12117b7440e5 Mon Sep 17 00:00:00 2001 From: kazuki Date: Thu, 12 Sep 2024 22:39:35 +0900 Subject: [PATCH 33/44] feat: Llama-3.1-8B instruct --- .../Llama-3.1-8B-instruct-exp2-8.sh | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh new file mode 100644 index 0000000..d5b8b7f --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-8.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized 
SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-8/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-no-oasst + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-8-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From 254cbec40622baabe8090d6dd77920ab4b2ad2f5 Mon Sep 17 00:00:00 2001 From: kazuki Date: Fri, 13 Sep 2024 00:03:49 +0900 Subject: [PATCH 34/44] feat: llama-3.1-8b instruct --- .../Llama-3.1-8B-instruct-exp2-10+.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-10.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-9.sh | 122 ++++++++++++++++++ tools/dataset/merge_dataset.sh | 27 ++-- 4 files changed, 384 insertions(+), 9 deletions(-) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh new file mode 100644 index 0000000..c9a08ab --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh @@ -0,0 +1,122 @@ 
+#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10+/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10+-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh new file mode 
100644 index 0000000..cde3bf8 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git 
a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh new file mode 100644 index 0000000..103eaf1 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-9.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=1e-5 +MIN_LR=1e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-9/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-en-oasst + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-9-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ 
+ --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 646d1e9..2d83b19 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -4,8 +4,9 @@ set -e INCLUDE_REDACTED=false FILTERD_SCORE=7 -NEXT_TOKEN_PERCENT=0.5 -USE_OPEN_ASSISTANT=false +NEXT_TOKEN_PERCENT=0.25 +USE_OPEN_ASSISTANT=true +USE_ONLY_ENGLISH_OPEN_ASSISTANT=true OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -15,18 +16,26 @@ fi if ! $USE_OPEN_ASSISTANT; then OUTPUT_DIR=$OUTPUT_DIR-no-oasst +elif $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then + OUTPUT_DIR=$OUTPUT_DIR-en-oasst fi mkdir -p $OUTPUT_DIR if $USE_OPEN_ASSISTANT; then - FILES=( - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" - "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" - ) + if $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then + FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" + ) + else + FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-33k-ja/lm_filtered_split_4.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/general/oasst2-top1-en-chat-sft/data/train.jsonl" + ) + fi MERGED_FILE=$OUTPUT_DIR/merged.jsonl From 85eabe1272eebfcad83ebf6575ba3a166971d33c Mon Sep 17 00:00:00 2001 From: kazuki Date: Fri, 13 Sep 2024 00:04:05 +0900 Subject: [PATCH 35/44] feat: dataset's merge script --- tools/dataset/merge_dataset.sh | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 2d83b19..9c36ee0 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -2,11 +2,12 @@ set -e -INCLUDE_REDACTED=false +INCLUDE_REDACTED=true FILTERD_SCORE=7 NEXT_TOKEN_PERCENT=0.25 -USE_OPEN_ASSISTANT=true -USE_ONLY_ENGLISH_OPEN_ASSISTANT=true +USE_OPEN_ASSISTANT=false +USE_ONLY_ENGLISH_OPEN_ASSISTANT=false +USE_ENGLISH_LMSYS=true OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT @@ -20,6 +21,10 @@ elif $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then OUTPUT_DIR=$OUTPUT_DIR-en-oasst fi +if $USE_ENGLISH_LMSYS; then + OUTPUT_DIR=$OUTPUT_DIR-en-lmsys +fi + mkdir -p $OUTPUT_DIR if $USE_OPEN_ASSISTANT; then @@ -37,7 +42,7 @@ if $USE_OPEN_ASSISTANT; then ) fi - MERGED_FILE=$OUTPUT_DIR/merged.jsonl + MERGED_FILE=$OUTPUT_DIR/merged_oasst.jsonl for FILE in "${FILES[@]}"; do cat $FILE >> $MERGED_FILE @@ -59,13 +64,21 @@ else echo "Skipped Open Assistant data processing." 
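# Worked example (illustration only) of how the flags above compose the output
# directory name. With INCLUDE_REDACTED=true, FILTERD_SCORE=7, NEXT_TOKEN_PERCENT=0,
# USE_OPEN_ASSISTANT=false and USE_ENGLISH_LMSYS=true, the script produces
#   exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys
# which is the DATASET_DIR consumed by Llama-3.1-8B-instruct-exp2-10.sh above:
# each boolean appends its own suffix (-redacted, -no-oasst or -en-oasst, -en-lmsys)
# to the exp2-filtered-<score>-next_token-<percent> base name.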
fi +# 日本語のLMSYSデータを常に使用 if $INCLUDE_REDACTED; then - LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl + JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl else - LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-no-redacted.jsonl + JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-no-redacted.jsonl fi -cat $LMSYS_FILE >> $OUTPUT_DIR/train.jsonl +cat $JA_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl + +# 英語のLMSYSデータを追加でオプションとして使用 +if $USE_ENGLISH_LMSYS; then + EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en.jsonl + cat $EN_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added English LMSYS data" +fi INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) From 748e2f455990dfea9603611ad723d5983da2d075 Mon Sep 17 00:00:00 2001 From: kazuki Date: Fri, 13 Sep 2024 00:05:48 +0900 Subject: [PATCH 36/44] chore: update dataset path (for exp10+) --- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh index c9a08ab..a0a3e84 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh @@ -67,7 +67,7 @@ CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted-no-oasst-en-lmsys TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl VALID_DATA_PATH=${DATASET_DIR}/train.jsonl From 72d1ddae0e78378584882e27e715beb1116ead24 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 15:22:17 +0900 Subject: [PATCH 37/44] feat: instruct scripts --- .../Llama-3.1-8B-instruct-exp2-10+.sh | 4 +- ....sh => Llama-3.1-8B-instruct-exp2-10-1.sh} | 0 .../Llama-3.1-8B-instruct-exp2-10-2.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-10-3.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-13.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-14.sh | 122 ++++++++++++++++++ .../Llama-3.1-8B-instruct-exp2-5.sh | 6 +- .../Llama-3.1-8B-instruct-exp2-6.sh | 6 +- 8 files changed, 496 insertions(+), 8 deletions(-) rename scripts/abci/instruction/Llama-3.1-8B/{Llama-3.1-8B-instruct-exp2-10.sh => Llama-3.1-8B-instruct-exp2-10-1.sh} (100%) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh index a0a3e84..2109b46 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh +++ 
b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10+.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=1e-5 -MIN_LR=1e-6 +LR=2.5e-5 +MIN_LR=2.5e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-1.sh similarity index 100% rename from scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10.sh rename to scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-1.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh new file mode 100644 index 0000000..b2fb469 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-2.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + 
--lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh new file mode 100644 index 0000000..dbfeed3 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=2:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-3/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-3-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir 
${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh new file mode 100644 index 0000000..9fd9417 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-13.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=3.5e-5 +MIN_LR=3.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-13/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-13-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length 
${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh new file mode 100644 index 0000000..1549fb1 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-14.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:00:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=5e-5 +MIN_LR=5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-14/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.25-redacted + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-14-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + 
-bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh index b7498c2..6ed7988 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-5.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=1e-5 -MIN_LR=1e-6 +LR=2.5e-5 +MIN_LR=2.5e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 @@ -67,7 +67,7 @@ CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0-redacted +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-redacted TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl VALID_DATA_PATH=${DATASET_DIR}/train.jsonl diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh index 8dd8d21..95fcea5 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -54,8 +54,8 @@ MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=256 # optimizer config -LR=1e-5 -MIN_LR=1e-6 +LR=2.5e-5 +MIN_LR=2.5e-6 WEIGHT_DECAY=0.1 GRAD_CLIP=1 @@ -67,7 +67,7 @@ CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp mkdir -p ${CHECKPOINT_SAVE_DIR} # dataset -DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-7-next_token-0.5-redacted +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0.5-redacted TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl VALID_DATA_PATH=${DATASET_DIR}/train.jsonl From 28a63ccca42d164060d13e96375c50571d4fb5da Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:04:05 +0900 Subject: [PATCH 38/44] feat: update merge script --- tools/dataset/merge_dataset.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 9c36ee0..e80e9a6 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -2,9 +2,9 @@ set -e -INCLUDE_REDACTED=true -FILTERD_SCORE=7 -NEXT_TOKEN_PERCENT=0.25 +INCLUDE_REDACTED=false +FILTERD_SCORE=0 
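# Sketch of the cosine learning-rate decay selected by the instruction-tuning
# scripts above (--lr-decay-style cosine; each configuration pairs MIN_LR with
# LR / 10). The standard cosine form is assumed here for illustration; warmup and
# the exact schedule implemented in examples/finetuning.py are not reproduced.
LR=2.5e-5; MIN_LR=2.5e-6
for t in 0 0.25 0.5 0.75 1; do
    awk -v lr="$LR" -v min="$MIN_LR" -v t="$t" 'BEGIN {
        pi = atan2(0, -1)
        printf "progress=%.2f lr=%.2e\n", t, min + 0.5 * (lr - min) * (1 + cos(pi * t))
    }'
done
# progress=0.00 starts at LR (2.50e-05) and progress=1.00 ends at MIN_LR (2.50e-06).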
+NEXT_TOKEN_PERCENT=0 USE_OPEN_ASSISTANT=false USE_ONLY_ENGLISH_OPEN_ASSISTANT=false USE_ENGLISH_LMSYS=true @@ -68,14 +68,18 @@ fi if $INCLUDE_REDACTED; then JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train.jsonl else - JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-no-redacted.jsonl + JA_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-ja-no-redacted.jsonl fi cat $JA_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl # 英語のLMSYSデータを追加でオプションとして使用 if $USE_ENGLISH_LMSYS; then - EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en.jsonl + if $INCLUDE_REDACTED; then + EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en.jsonl + else + EN_LMSYS_FILE=/bb/llm/gaf51275/datasets/raw/instruct/lmsys-chat-1m/sft/lmsys-chat-1m-train-en-no-redacted.jsonl + fi cat $EN_LMSYS_FILE >> $OUTPUT_DIR/train.jsonl echo "Added English LMSYS data" fi From 156f12cab6c37118f60982a1dde08d66268ef147 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:04:23 +0900 Subject: [PATCH 39/44] feat: change dataset path for exp2-10-3 --- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh index dbfeed3..f7872b6 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-3.sh @@ -1,6 +1,6 @@ #!/bin/bash #$ -l rt_AF=8 -#$ -l h_rt=2:00:00:00 +#$ -l h_rt=1:16:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ #$ -cwd From c4e85979f0e5d78685358414ab6db1053fe7341d Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:54:20 +0900 Subject: [PATCH 40/44] feat: update dataset merge script (for exp2-10-4) --- tools/dataset/merge_dataset.sh | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index e80e9a6..16534b7 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -2,27 +2,41 @@ set -e +# Control variables INCLUDE_REDACTED=false FILTERD_SCORE=0 NEXT_TOKEN_PERCENT=0 USE_OPEN_ASSISTANT=false USE_ONLY_ENGLISH_OPEN_ASSISTANT=false USE_ENGLISH_LMSYS=true +USE_MAGPIE_ULTRA=true +CUSTOM_OUTPUT_DIR="" -OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT +# Base output directory +BASE_OUTPUT_DIR="/bb/llm/gaf51275/datasets/raw/instruct/training" + +if [ -z "$CUSTOM_OUTPUT_DIR" ]; then + OUTPUT_DIR="$BASE_OUTPUT_DIR/exp2-filtered-$FILTERD_SCORE-next_token-$NEXT_TOKEN_PERCENT" +else + OUTPUT_DIR="$CUSTOM_OUTPUT_DIR" +fi if $INCLUDE_REDACTED; then - OUTPUT_DIR=$OUTPUT_DIR-redacted + OUTPUT_DIR="${OUTPUT_DIR}-redacted" fi if ! 
$USE_OPEN_ASSISTANT; then - OUTPUT_DIR=$OUTPUT_DIR-no-oasst + OUTPUT_DIR="${OUTPUT_DIR}-no-oasst" elif $USE_ONLY_ENGLISH_OPEN_ASSISTANT; then - OUTPUT_DIR=$OUTPUT_DIR-en-oasst + OUTPUT_DIR="${OUTPUT_DIR}-en-oasst" fi if $USE_ENGLISH_LMSYS; then - OUTPUT_DIR=$OUTPUT_DIR-en-lmsys + OUTPUT_DIR="${OUTPUT_DIR}-en-lmsys" +fi + +if $USE_MAGPIE_ULTRA; then + OUTPUT_DIR="${OUTPUT_DIR}-magpie-ultra" fi mkdir -p $OUTPUT_DIR @@ -84,6 +98,13 @@ if $USE_ENGLISH_LMSYS; then echo "Added English LMSYS data" fi +# Add magpie-ultra dataset processing +if $USE_MAGPIE_ULTRA; then + MAGPIE_ULTRA_FILE=/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/train.jsonl + cat $MAGPIE_ULTRA_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added magpie-ultra data" +fi + INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) From 2c622153f0185bd256e3860a51567908f117b7d0 Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:54:29 +0900 Subject: [PATCH 41/44] feat: exp2-10-4 --- .../Llama-3.1-8B-instruct-exp2-10-4.sh | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh new file mode 100644 index 0000000..2661f9f --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=8 +#$ -l h_rt=1:16:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir -p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-4/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys-magpie-ultra + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl 
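# Worked example of the NEXT_TOKEN_SAMPLES arithmetic in merge_dataset.sh above
# (illustrative counts, not real dataset sizes): with NEXT_TOKEN_PERCENT=0.25 and
# 75000 instruction samples, the formula yields 25000 next-token-prediction
# samples, i.e. it sizes the next-token set so that it is NEXT_TOKEN_PERCENT of
# the combined 100000-sample mix. The trailing "/ 1" truncates the result to an
# integer because bc's default scale is 0.
echo "(75000 / (1 - 0.25)) * 0.25 / 1" | bc   # -> 25000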
+VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-4-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" From 9ce970ee6da258c93b4e55e83fb2c50bd18ff5ab Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 16:54:44 +0900 Subject: [PATCH 42/44] feat: implement for magpie-ultra --- .../dataset/converter/convert_magpie_ultra.py | 85 +++++++++++++++++++ tools/dataset/merge_magpie_ultra.sh | 23 +++++ 2 files changed, 108 insertions(+) create mode 100644 tools/dataset/converter/convert_magpie_ultra.py create mode 100644 tools/dataset/merge_magpie_ultra.sh diff --git a/tools/dataset/converter/convert_magpie_ultra.py b/tools/dataset/converter/convert_magpie_ultra.py new file mode 100644 index 0000000..8e64b48 --- /dev/null +++ b/tools/dataset/converter/convert_magpie_ultra.py @@ -0,0 +1,85 @@ +import argparse +import json +import sys + + +def process_input(input_file): + data = [] + with open(input_file, "r", encoding="utf-8") as f: + for line in f: + try: + item = json.loads(line.strip()) + data.append(item) + except json.JSONDecodeError as e: + print(f"Warning: Skipping invalid JSON line: {line.strip()}", file=sys.stderr) + return data + + +def convert_to_output(input_data, include_english: bool = False): + output_data = [] + for item in input_data: + if item.get("quality") in ["average", "good", "excellent"]: + output_item = { + "input": [ + { + "role": "user", + "content": item["processed_translated_instruction"] + } + ], + "output": { + "role": "assistant", + "content": item["processed_translated_response"] + }, + "quality": item["quality"], + "primary_tag": item["primary_tag"], + } + output_data.append(output_item) + if include_english: + en_output_item = { + "input": [ + { + "role": "user", + "content": item["instruction"], + } + ], + "output": { + "role": "assistant", + "content": item["response"] + }, + "quality": item["quality"], + "primary_tag": item["primary_tag"], + } + output_data.append(en_output_item) + + return output_data + + +def save_output(output_data, output_file): + with open(output_file, "w", encoding="utf-8") as f: + for item in output_data: + json.dump(item, f, ensure_ascii=False) + f.write("\n") + + +def main(): + parser = 
argparse.ArgumentParser(description="Convert input JSONL to output JSONL") + parser.add_argument("--input", required=True, help="Input JSONL file path") + parser.add_argument("--output", required=True, help="Output JSONL file path") + parser.add_argument("--include-english", action="store_true") + + args = parser.parse_args() + + try: + input_data = process_input(args.input) + output_data = convert_to_output( + input_data=input_data, include_english=args.include_english + ) + save_output(output_data, args.output) + print(f"Conversion completed. Output saved to {args.output}") + except Exception as e: + print(f"An error occurred: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/dataset/merge_magpie_ultra.sh b/tools/dataset/merge_magpie_ultra.sh new file mode 100644 index 0000000..7513078 --- /dev/null +++ b/tools/dataset/merge_magpie_ultra.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +OUTPUT_DIR=/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data + +mkdir -p $OUTPUT_DIR +rm $OUTPUT_DIR/merged.jsonl + +FILES=( + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00000-of-00002_3.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_1.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_2.jsonl" + "/bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/lm_train-00001-of-00002_3.jsonl" +) + +MERGED_FILE=$OUTPUT_DIR/merged.jsonl + +for FILE in "${FILES[@]}"; do + cat $FILE >> $MERGED_FILE +done + +wc -l $MERGED_FILE From 561f0f464c8b0dcd5adedabdec0ab9b2f5b50d9f Mon Sep 17 00:00:00 2001 From: kazuki Date: Sat, 14 Sep 2024 17:08:54 +0900 Subject: [PATCH 43/44] feat: exp2-10-5 --- .../Llama-3.1-8B-instruct-exp2-10-5.sh | 122 ++++++++++++++++++ tools/dataset/merge_dataset.sh | 12 ++ 2 files changed, 134 insertions(+) create mode 100644 scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh new file mode 100644 index 0000000..1f30b07 --- /dev/null +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-5.sh @@ -0,0 +1,122 @@ +#!/bin/bash +#$ -l rt_AF=16 +#$ -l h_rt=1:16:00:00 +#$ -j y +#$ -o outputs/instruction/Llama-3.1-8B/ +#$ -cwd + +# module load +source /etc/profile.d/modules.sh +module use /bb/llm/gaf51275/modules/modulefiles + +module load cuda/12.1/12.1.1 +module load cudnn/cuda-12.1/9.0.0 +module load nccl/2.20.5 +module load hpcx/2.12 +module load gcc/11.4.0 + +# swich virtual env +source .env/bin/activate + +# distributed settings +export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1) +export MASTER_PORT=$((10000 + ($JOB_ID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +# hostfile + +if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then + export NUM_GPU_PER_NODE=4 + NODE_TYPE="v100" +elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then + export NUM_GPU_PER_NODE=8 + NODE_TYPE="a100" +else + echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE" +fi + +NUM_NODES=$NHOSTS +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE})) + +mkdir 
-p ./hostfile + +HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID} +while read -r line; do + echo "${line} slots=${NUM_GPU_PER_NODE}" +done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME" + +# training config +SEQ_LENGTH=8192 +DATA_PARALLEL_SIZE=$NUM_GPUS + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=256 + +# optimizer config +LR=2.5e-5 +MIN_LR=2.5e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# checkpoint +TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct +CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500 +CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp2-10-5/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}" + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# dataset +DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp2-filtered-0-next_token-0-no-oasst-en-lmsys-magpie-ultra-gemma-magpie + +TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl +VALID_DATA_PATH=${DATASET_DIR}/train.jsonl + +# job name +JOB_NAME="Llama-3.1-8B-instruct-exp-2-10-5-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}" + +# run +mpirun -np $NUM_GPUS \ + --npernode $NUM_GPU_PER_NODE \ + -hostfile $HOSTFILE_NAME \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -bind-to none \ + -x NCCL_IB_TIMEOUT=22 \ + -x LD_LIBRARY_PATH \ + -x PATH \ + python examples/finetuning.py \ + --seq-length ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --hf-transformer-model-dir ${TOKENIZER_DIR} \ + --instruction-train-data-path ${TRAIN_DATA_PATH} \ + --instruction-valid-data-path ${VALID_DATA_PATH} \ + --epoch 2 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --weight-decay ${WEIGHT_DECAY} \ + --grad-clip-norm ${GRAD_CLIP} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --save-interval 500 \ + --eval-interval 500000 \ + --eval-iters 10 \ + --bf16 \ + --mixed-precision \ + --base-model ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --load ${CHECKPOINT_SAVE_DIR} \ + --low-cpu-fsdp \ + --sharding-strategy FULL_SHARD \ + --checkpoint-type LOCAL_STATE_DICT \ + --fsdp-activation-checkpointing \ + --instruction-tuning \ + --save-sampler-state \ + --use-mpi \ + --wandb-entity "prj-jalm" \ + --wandb-project "Llama-3.1-8B-Instruct" \ + --wandb-name "${JOB_NAME}" diff --git a/tools/dataset/merge_dataset.sh b/tools/dataset/merge_dataset.sh index 16534b7..b02410b 100644 --- a/tools/dataset/merge_dataset.sh +++ b/tools/dataset/merge_dataset.sh @@ -10,6 +10,7 @@ USE_OPEN_ASSISTANT=false USE_ONLY_ENGLISH_OPEN_ASSISTANT=false USE_ENGLISH_LMSYS=true USE_MAGPIE_ULTRA=true +USE_GEMMA_MAGPIE=true CUSTOM_OUTPUT_DIR="" # Base output directory @@ -39,6 +40,10 @@ if $USE_MAGPIE_ULTRA; then OUTPUT_DIR="${OUTPUT_DIR}-magpie-ultra" fi +if $USE_GEMMA_MAGPIE; then + OUTPUT_DIR="${OUTPUT_DIR}-gemma-magpie" +fi + mkdir -p $OUTPUT_DIR if $USE_OPEN_ASSISTANT; then @@ -105,6 +110,13 @@ if $USE_MAGPIE_ULTRA; then echo "Added magpie-ultra data" fi +# Add gemma-magpie dataset processing +if $USE_GEMMA_MAGPIE; then + GEMMA_MAGPIE_FILE=/bb/llm/gaf51275/datasets/raw/instruct/MAGPIE/gemma2-27b-it/format.jsonl + cat $GEMMA_MAGPIE_FILE >> $OUTPUT_DIR/train.jsonl + echo "Added gemma-magpie data" +fi + INSTRUCTION_SAMPLES=$(wc -l $OUTPUT_DIR/train.jsonl | awk '{print $1}') NEXT_TOKEN_SAMPLES=$(echo "($INSTRUCTION_SAMPLES / (1 - $NEXT_TOKEN_PERCENT)) * $NEXT_TOKEN_PERCENT / 1" | bc) From 
3a9da08c09f3ad3de0e5bc7f3dc9c41fdbb886b8 Mon Sep 17 00:00:00 2001 From: kazuki Date: Mon, 16 Sep 2024 00:18:35 +0900 Subject: [PATCH 44/44] chore: update job running-time --- .../Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh | 4 ++-- .../instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh index 2661f9f..63810b5 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-10-4.sh @@ -1,6 +1,6 @@ #!/bin/bash -#$ -l rt_AF=8 -#$ -l h_rt=1:16:00:00 +#$ -l rt_AF=16 +#$ -l h_rt=1:8:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ #$ -cwd diff --git a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh index 95fcea5..21b4b8e 100644 --- a/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh +++ b/scripts/abci/instruction/Llama-3.1-8B/Llama-3.1-8B-instruct-exp2-6.sh @@ -1,6 +1,6 @@ #!/bin/bash -#$ -l rt_AF=8 -#$ -l h_rt=2:00:00:00 +#$ -l rt_AF=16 +#$ -l h_rt=0:20:00:00 #$ -j y #$ -o outputs/instruction/Llama-3.1-8B/ #$ -cwd
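
Note on how the magpie-ultra tooling added in the patches above is meant to chain together: merge_magpie_ultra.sh concatenates the raw shard files into merged.jsonl, convert_magpie_ultra.py keeps only rows rated average/good/excellent and reshapes them into the instruction-tuning record schema, and merge_dataset.sh (with USE_MAGPIE_ULTRA=true) appends the converted file to the combined training set. The sketch below illustrates that flow; the converter's input and output file names are assumptions inferred from the other two scripts, while the script paths, flags, and toggle names come directly from the patches.

# Sketch of the assumed end-to-end magpie-ultra data flow (not part of any patch).
# 1) Concatenate the six shard files into data/merged.jsonl.
bash tools/dataset/merge_magpie_ultra.sh

# 2) Filter by quality and convert to the {"input": [...], "output": {...}} schema.
#    The --input/--output paths here are assumptions; --include-english is optional
#    and additionally emits the untranslated English pair for each kept row.
python tools/dataset/converter/convert_magpie_ultra.py \
    --input /bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/merged.jsonl \
    --output /bb/llm/gaf51275/datasets/raw/instruct/synthetic/magpie-ultra-v0.1/data/train.jsonl

# 3) Append the converted file to train.jsonl for the experiment
#    (merge_dataset.sh reads it when USE_MAGPIE_ULTRA=true).
bash tools/dataset/merge_dataset.sh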