Skip to content

Commit

Permalink
feat: instruct model
Browse files Browse the repository at this point in the history
  • Loading branch information
okoge-kaz committed Aug 4, 2024
1 parent 6b98926 commit f3a9b95
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/sh
#$ -cwd
#$ -l node_f=2
#$ -l h_rt=1:00:00:00
#$ -l node_f=4
#$ -l h_rt=0:20:00:00
#$ -o outputs/Llama-3-8b-instruct/$JOB_ID.log
#$ -e outputs/Llama-3-8b-instruct/$JOB_ID.log
#$ -p -5
Expand Down Expand Up @@ -61,8 +61,8 @@ mkdir -p ${CHECKPOINT_SAVE_DIR}
# dataset
DATASET_DIR=/gs/bs/tga-NII-LLM/datasets/raw/instruct/synthetic/general/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k

TRAIN_DATA_PATH=${DATASET_DIR}/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl
VALID_DATA_PATH=${DATASET_DIR}/Synthetic-JP-Conversations-Magpie-Nemotron-4-10k.jsonl
TRAIN_DATA_PATH=${DATASET_DIR}/converted.jsonl
VALID_DATA_PATH=${DATASET_DIR}/converted.jsonl

# job name
JOB_NAME="Llama-3-8B-instruct-v0.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
Expand Down
42 changes: 42 additions & 0 deletions tools/dataset/convert_dataset_instruct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import argparse
import json
import copy


def convert_jsonl(input_path: str, output_path: str) -> None:
    """Expand each conversation into one input/output sample per reply.

    Every non-blank line of ``input_path`` is parsed as a JSON object with a
    ``"messages"`` list. Messages are paired positionally: the message at an
    even index is treated as the prompt and the following one as the reply
    (roles are not inspected). For each pair, one record is written to
    ``output_path``::

        {"input": [<all messages up to and including the prompt>],
         "output": <the reply message>}

    so a conversation with N pairs yields N records with a growing history.

    Args:
        input_path: Path to the source JSONL file.
        output_path: Path of the JSONL file to write (overwritten).

    Raises:
        ValueError: If a conversation has an odd number of messages.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"messages"`` key.
    """
    converted_data = []

    with open(input_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue

            item = json.loads(line)
            messages = item['messages']

            # Explicit check rather than `assert`, which is removed under -O.
            if len(messages) % 2 != 0:
                raise ValueError(
                    f"{input_path}:{line_number}: expected an even number of "
                    f"messages, got {len(messages)}"
                )

            inputs = []
            for user_message, assistant_message in zip(messages[::2], messages[1::2]):
                inputs.append(user_message)
                converted_data.append({
                    # Snapshot the history; `inputs` keeps growing below.
                    "input": copy.deepcopy(inputs),
                    "output": assistant_message,
                })
                inputs.append(assistant_message)

    # The output is opened only after the input has been fully read.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for item in converted_data:
            outfile.write(json.dumps(item, ensure_ascii=False) + '\n')

def main():
    """Parse command-line arguments and run the JSONL conversion.

    Both ``--input-path`` and ``--output-path`` are mandatory; omitting either
    makes argparse print a usage error and exit, rather than passing ``None``
    on to ``convert_jsonl``.
    """
    parser = argparse.ArgumentParser(description="Convert JSONL file keys to specified format.")
    parser.add_argument('--input-path', type=str, required=True, help='Path to the input JSONL file')
    parser.add_argument('--output-path', type=str, required=True, help='Path to the output JSONL file')

    args = parser.parse_args()

    convert_jsonl(args.input_path, args.output_path)

# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 comments on commit f3a9b95

Please sign in to comment.