fix ckpt loading for nemo1 and t5 tests
Signed-off-by: Chen Cui <chcui@nvidia.com>
cuichenx authored and ko3n1g committed Jan 25, 2025
1 parent 0733b7f commit d6fd229
Showing 2 changed files with 14 additions and 0 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -541,6 +541,7 @@ jobs:
trainer.val_check_interval=5 \
trainer.limit_val_batches=2 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=2 \
model.pipeline_model_parallel_size=1 \
@@ -564,6 +565,7 @@ jobs:
trainer.num_nodes=1 \
trainer.precision=bf16 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=2 \
prune.num_calib_size=8 \
@@ -585,6 +587,7 @@ jobs:
trainer.num_nodes=1 \
trainer.precision=bf16 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=2 \
model.pipeline_model_parallel_size=1 \
'prune.drop_layers=[1]' \
@@ -2764,6 +2767,7 @@ jobs:
model.peft.peft_scheme=none \
model.optim.name=distributed_fused_adam \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
@@ -2791,6 +2795,7 @@ jobs:
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \
++model.dist_ckpt_load_strictness=log_all \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \
model.data.validation_ds.write_embeddings_to_file=True \
@@ -2814,6 +2819,7 @@ jobs:
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \
++model.dist_ckpt_load_strictness=log_all \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \
model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \
@@ -2854,6 +2860,7 @@ jobs:
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.megatron_amp_O2=True \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
@@ -2868,6 +2875,7 @@ jobs:
python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
@@ -2903,6 +2911,7 @@ jobs:
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.peft_scheme="lora" \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -2916,6 +2925,7 @@ jobs:
python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2_o1/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -2952,6 +2962,7 @@ jobs:
model.sequence_parallel=True \
model.megatron_amp_O2=True \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
+model.fp8=True \
+model.fp8_params=True \
+model.fp8_hybrid=True \
@@ -3565,6 +3576,7 @@ jobs:
SCRIPT: |
python examples/nlp/language_modeling/megatron_t5_eval.py \
--model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
++model.dist_ckpt_load_strictness=log_all \
--prompt "How do I fix my GPU memory issue? I am seeing <mask> out of memory." \
--tensor_model_parallel_size 1
@@ -3587,6 +3599,7 @@ jobs:
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
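
Every workflow change above adds the same Hydra override, ++model.dist_ckpt_load_strictness=log_all, to an existing NeMo 1 test command. As a rough illustration (not code from this repo), the ++ prefix force-adds the key to the script's config even when the base config does not declare it; in OmegaConf terms it behaves roughly like opening a struct config and assigning the new key:

from omegaconf import OmegaConf, open_dict

cfg = OmegaConf.create({"model": {"restore_from_path": "/path/to/llama_ci.nemo"}})  # illustrative base config
OmegaConf.set_struct(cfg, True)   # struct mode: assigning an unknown key would normally raise
with open_dict(cfg):              # "++" force-add: allow the new key and set it
    cfg.model.dist_ckpt_load_strictness = "log_all"
print(OmegaConf.to_yaml(cfg))

Per the comment added in the second file, log_all is presumably meant to let CI keep restoring checkpoints produced by older NeMo versions by logging mismatches instead of failing the load.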
1 change: 1 addition & 0 deletions tests/collections/llm/megatron_t5_finetuning.py
@@ -87,6 +87,7 @@ def get_args():
pipeline_model_parallel_size=1,
pipeline_dtype=torch.float32,
ckpt_load_optimizer=False,
ckpt_load_strictness="log_all", # Only for CI tests to use older versions of checkpoint
)
checkpoint_callback = ModelCheckpoint(
every_n_train_steps=5000,
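
The NeMo 2 T5 test gets the same relaxation as a Python keyword argument rather than a Hydra override. A minimal sketch of how that kwarg sits in the strategy setup, assuming (as the surrounding kwargs in the diff suggest) that these arguments are passed to nemo.lightning.MegatronStrategy; the parallel sizes shown are illustrative:

import torch
from nemo import lightning as nl

strategy = nl.MegatronStrategy(          # assumed target class; kwargs mirror the diff context
    tensor_model_parallel_size=1,        # illustrative value
    pipeline_model_parallel_size=1,
    pipeline_dtype=torch.float32,
    ckpt_load_optimizer=False,           # skip restoring optimizer state
    ckpt_load_strictness="log_all",      # log mismatched keys instead of raising,
                                         # so CI can still load older-format checkpoints
)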
