From f3a9b95912d8b562af96fdedfcf1919744ddf64c Mon Sep 17 00:00:00 2001
From: kazuki
Date: Sun, 4 Aug 2024 10:24:21 +0900
Subject: [PATCH] feat: instruct model

---
 .../Llama-3-8B/Llama-3-8B-instruct-v0.2.sh |  8 ++--
 tools/dataset/convert_dataset_instruct.py  | 47 +++++++++++++++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)
 create mode 100644 tools/dataset/convert_dataset_instruct.py

diff --git a/scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh b/scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh
index 30846bf..fa9e3f5 100644
--- a/scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh
+++ b/scripts/tsubame/instruct/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 #$ -cwd
-#$ -l node_f=2
-#$ -l h_rt=1:00:00:00
+#$ -l node_f=4
+#$ -l h_rt=0:20:00:00
 #$ -o outputs/Llama-3-8b-instruct/$JOB_ID.log
 #$ -e outputs/Llama-3-8b-instruct/$JOB_ID.log
 #$ -p -5
@@ -61,8 +61,8 @@ mkdir -p ${CHECKPOINT_SAVE_DIR}
 
 # dataset
 DATASET_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k
-TRAIN_DATA_PATH=${DATASET_DIR}/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl
-VALID_DATA_PATH=${DATASET_DIR}/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl
+TRAIN_DATA_PATH=${DATASET_DIR}/converted.jsonl
+VALID_DATA_PATH=${DATASET_DIR}/converted.jsonl
 
 # job name
 JOB_NAME="Llama-3-8B-instruct-v0.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
diff --git a/tools/dataset/convert_dataset_instruct.py b/tools/dataset/convert_dataset_instruct.py
new file mode 100644
index 0000000..ed35caf
--- /dev/null
+++ b/tools/dataset/convert_dataset_instruct.py
@@ -0,0 +1,47 @@
+import argparse
+import json
+import copy
+
+
+def convert_jsonl(input_path: str, output_path: str) -> None:
+    """Convert chat-style 'messages' records into (input, output) training pairs."""
+    converted_data = []
+
+    with open(input_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            item = json.loads(line)
+            messages = item['messages']
+
+            assert len(messages) % 2 == 0, "messages must alternate user/assistant turns"
+            conversation_turn: int = len(messages) // 2
+
+            # emit one training example per assistant turn, with the full
+            # conversation history up to that point as the input
+            inputs = []
+            for i in range(conversation_turn):
+                user_message = messages[i * 2]
+                assistant_message = messages[i * 2 + 1]
+                inputs.append(user_message)
+                converted_data.append({
+                    "input": copy.deepcopy(inputs),
+                    "output": assistant_message
+                })
+                inputs.append(assistant_message)
+
+    with open(output_path, 'w', encoding='utf-8') as outfile:
+        for item in converted_data:
+            outfile.write(json.dumps(item, ensure_ascii=False) + '\n')
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Convert a chat-format ('messages') JSONL file into input/output training pairs.")
+    parser.add_argument('--input-path', type=str, required=True, help='Path to the input JSONL file')
+    parser.add_argument('--output-path', type=str, required=True, help='Path to the output JSONL file')
+
+    args = parser.parse_args()
+
+    convert_jsonl(args.input_path, args.output_path)
+
+
+if __name__ == "__main__":
+    main()
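
For reference, a usage sketch of the new converter, using the dataset paths from the job script (the role/content message fields below are illustrative; the converter treats each message as an opaque JSON object):

    python tools/dataset/convert_dataset_instruct.py \
        --input-path ${DATASET_DIR}/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl \
        --output-path ${DATASET_DIR}/converted.jsonl

Given an input record with two user/assistant turns,

    {"messages": [{"role": "user", "content": "U1"}, {"role": "assistant", "content": "A1"}, {"role": "user", "content": "U2"}, {"role": "assistant", "content": "A2"}]}

the script emits one training example per assistant turn, each carrying the full conversation history up to that point:

    {"input": [{"role": "user", "content": "U1"}], "output": {"role": "assistant", "content": "A1"}}
    {"input": [{"role": "user", "content": "U1"}, {"role": "assistant", "content": "A1"}, {"role": "user", "content": "U2"}], "output": {"role": "assistant", "content": "A2"}}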