fix ckpt loading for nemo1 and t5 tests
Signed-off-by: Chen Cui <chcui@nvidia.com>
cuichenx authored and ko3n1g committed Jan 25, 2025
1 parent 0733b7f commit d6fd229
Showing 2 changed files with 14 additions and 0 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -541,6 +541,7 @@ jobs:
trainer.val_check_interval=5 \
trainer.limit_val_batches=2 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=2 \
model.pipeline_model_parallel_size=1 \
@@ -564,6 +565,7 @@ jobs:
trainer.num_nodes=1 \
trainer.precision=bf16 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=2 \
prune.num_calib_size=8 \
@@ -585,6 +587,7 @@ jobs:
trainer.num_nodes=1 \
trainer.precision=bf16 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=2 \
model.pipeline_model_parallel_size=1 \
'prune.drop_layers=[1]' \
@@ -2764,6 +2767,7 @@ jobs:
model.peft.peft_scheme=none \
model.optim.name=distributed_fused_adam \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
@@ -2791,6 +2795,7 @@ jobs:
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \
++model.dist_ckpt_load_strictness=log_all \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \
model.data.validation_ds.write_embeddings_to_file=True \
@@ -2814,6 +2819,7 @@ jobs:
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \
++model.dist_ckpt_load_strictness=log_all \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \
model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \
@@ -2854,6 +2860,7 @@ jobs:
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.megatron_amp_O2=True \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
@@ -2868,6 +2875,7 @@ jobs:
python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
@@ -2903,6 +2911,7 @@ jobs:
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.peft_scheme="lora" \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -2916,6 +2925,7 @@ jobs:
python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2_o1/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -2952,6 +2962,7 @@ jobs:
model.sequence_parallel=True \
model.megatron_amp_O2=True \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
+model.fp8=True \
+model.fp8_params=True \
+model.fp8_hybrid=True \
@@ -3565,6 +3576,7 @@ jobs:
SCRIPT: |
python examples/nlp/language_modeling/megatron_t5_eval.py \
--model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
++model.dist_ckpt_load_strictness=log_all \
--prompt "How do I fix my GPU memory issue? I am seeing <mask> out of memory." \
--tensor_model_parallel_size 1
@@ -3587,6 +3599,7 @@ jobs:
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
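
Every workflow change above adds the same Hydra override, ++model.dist_ckpt_load_strictness=log_all, to an existing NeMo 1 test command. As a rough illustration (not code from this repo), the ++ prefix force-adds the key to the script's config even when the base config does not declare it; in OmegaConf terms it behaves roughly like opening a struct config and assigning the new key:

from omegaconf import OmegaConf, open_dict

cfg = OmegaConf.create({"model": {"restore_from_path": "/path/to/llama_ci.nemo"}})  # illustrative base config
OmegaConf.set_struct(cfg, True)   # struct mode: assigning an unknown key would normally raise
with open_dict(cfg):              # "++" force-add: allow the new key and set it
    cfg.model.dist_ckpt_load_strictness = "log_all"
print(OmegaConf.to_yaml(cfg))

Per the comment added in the second file, log_all is presumably meant to let CI keep restoring checkpoints produced by older NeMo versions by logging mismatches instead of failing the load.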
1 change: 1 addition & 0 deletions tests/collections/llm/megatron_t5_finetuning.py
@@ -87,6 +87,7 @@ def get_args():
pipeline_model_parallel_size=1,
pipeline_dtype=torch.float32,
ckpt_load_optimizer=False,
ckpt_load_strictness="log_all", # Only for CI tests to use older versions of checkpoint
)
checkpoint_callback = ModelCheckpoint(
every_n_train_steps=5000,
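
The NeMo 2 T5 test gets the same relaxation as a Python keyword argument rather than a Hydra override. A minimal sketch of how that kwarg sits in the strategy setup, assuming (as the surrounding kwargs in the diff suggest) that these arguments are passed to nemo.lightning.MegatronStrategy; the parallel sizes shown are illustrative:

import torch
from nemo import lightning as nl

strategy = nl.MegatronStrategy(          # assumed target class; kwargs mirror the diff context
    tensor_model_parallel_size=1,        # illustrative value
    pipeline_model_parallel_size=1,
    pipeline_dtype=torch.float32,
    ckpt_load_optimizer=False,           # skip restoring optimizer state
    ckpt_load_strictness="log_all",      # log mismatched keys instead of raising,
                                         # so CI can still load older-format checkpoints
)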
