build: Bump PyT and TE #11814

Draft · wants to merge 6 commits into base: main
73 changes: 43 additions & 30 deletions .github/workflows/cicd-main.yml
@@ -541,6 +541,7 @@ jobs:
trainer.val_check_interval=5 \
trainer.limit_val_batches=2 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=2 \
model.pipeline_model_parallel_size=1 \
@@ -564,6 +565,7 @@ jobs:
trainer.num_nodes=1 \
trainer.precision=bf16 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=2 \
prune.num_calib_size=8 \
@@ -585,6 +587,7 @@ jobs:
trainer.num_nodes=1 \
trainer.precision=bf16 \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=2 \
model.pipeline_model_parallel_size=1 \
'prune.drop_layers=[1]' \
@@ -2764,6 +2767,7 @@ jobs:
model.peft.peft_scheme=none \
model.optim.name=distributed_fused_adam \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
@@ -2791,6 +2795,7 @@ jobs:
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \
++model.dist_ckpt_load_strictness=log_all \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \
model.data.validation_ds.write_embeddings_to_file=True \
@@ -2814,6 +2819,7 @@ jobs:
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \
++model.dist_ckpt_load_strictness=log_all \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \
model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \
@@ -2854,6 +2860,7 @@ jobs:
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.megatron_amp_O2=True \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
@@ -2868,6 +2875,7 @@ jobs:

python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
@@ -2903,6 +2911,7 @@ jobs:
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.peft_scheme="lora" \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -2916,6 +2925,7 @@ jobs:

python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2_o1/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -2952,6 +2962,7 @@ jobs:
model.sequence_parallel=True \
model.megatron_amp_O2=True \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
++model.dist_ckpt_load_strictness=log_all \
+model.fp8=True \
+model.fp8_params=True \
+model.fp8_hybrid=True \
@@ -3565,6 +3576,7 @@ jobs:
SCRIPT: |
python examples/nlp/language_modeling/megatron_t5_eval.py \
--model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
++model.dist_ckpt_load_strictness=log_all \
--prompt "How do I fix my GPU memory issue? I am seeing <mask> out of memory." \
--tensor_model_parallel_size 1

@@ -3587,6 +3599,7 @@ jobs:
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
++model.dist_ckpt_load_strictness=log_all \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -3654,7 +3667,7 @@ jobs:
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_VLM_HF_Transformer_SFT_FSDP2') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/sft_fsdp2.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3
AFTER_SCRIPT: |
@@ -4393,7 +4406,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4403,7 +4416,7 @@ jobs:
--mbs 1

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4422,7 +4435,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4432,7 +4445,7 @@ jobs:
--mbs 2

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4451,7 +4464,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4461,7 +4474,7 @@ jobs:
--mbs 2

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4480,7 +4493,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4490,7 +4503,7 @@ jobs:
--mbs 2

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4509,7 +4522,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4519,7 +4532,7 @@ jobs:
--mbs 1 --packed

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4538,7 +4551,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4548,7 +4561,7 @@ jobs:
--mbs 1

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4567,7 +4580,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4577,7 +4590,7 @@ jobs:
--mbs 2

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4596,7 +4609,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4606,7 +4619,7 @@ jobs:
--mbs 2

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4625,7 +4638,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4635,7 +4648,7 @@ jobs:
--mbs 2

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4653,7 +4666,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4663,7 +4676,7 @@ jobs:
--mbs 1 --packed

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4681,7 +4694,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4691,7 +4704,7 @@ jobs:
--mbs 1 --packed

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4708,7 +4721,7 @@ jobs:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4718,7 +4731,7 @@ jobs:
--mbs 1 --packed

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4736,7 +4749,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4747,7 +4760,7 @@ jobs:
--chat_dataset_path /home/TestData/nemo2_data/chat

python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
--restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -4841,7 +4854,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/peft/lora_merge.py \
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v2/ \
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v3/ \
--output_path=/tmp/nemo2_lora_merge/${{ github.run_id }}

L2_NEMO_2_LoRA_Export:
@@ -4853,7 +4866,7 @@ jobs:
SCRIPT: |

python tests/collections/llm/peft/lora_export.py \
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v2/ \
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v3/ \
--output_path=/tmp/nemo2_lora_merge/${{ github.run_id }}

L2_NEMO_2_LoRA_Inference:
@@ -4865,7 +4878,7 @@ jobs:
SCRIPT: |

python scripts/llm/generate.py \
--model_path /home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v2/ \
--model_path /home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v3/ \
--tp 1 \
--pp 1 \
--devices 1 \
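The change repeated throughout this workflow file is the extra `++model.dist_ckpt_load_strictness=log_all` override appended to the Hydra-driven example scripts. In Hydra's override grammar, `++` means "add or override": the key is set whether or not it already exists in the model config restored from the `.nemo` checkpoint, which lets configs saved before this option existed accept it. A minimal OmegaConf sketch of that behaviour (illustrative keys only, not the real NeMo config):

```python
from omegaconf import OmegaConf, open_dict

# Stand-in for a model config restored from an older .nemo checkpoint.
cfg = OmegaConf.create({"model": {"tensor_model_parallel_size": 1}})
OmegaConf.set_struct(cfg, True)  # struct mode rejects unknown keys, as restored configs typically do

# A plain "model.dist_ckpt_load_strictness=log_all" override would fail here because
# the key is missing from the old config; "++" force-adds it instead, equivalent to:
with open_dict(cfg):
    cfg.model.dist_ckpt_load_strictness = "log_all"

print(cfg.model.dist_ckpt_load_strictness)  # -> log_all
```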
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.10-py3
ARG IMAGE_LABEL
FROM ${BASE_IMAGE}
ARG IMAGE_LABEL
2 changes: 1 addition & 1 deletion reinstall.sh
@@ -19,7 +19,7 @@ ${PIP} uninstall -y nemo_tts

export MAMBA_FORCE_BUILD=TRUE
export CAUSAL_CONV1D_FORCE_BUILD=TRUE
export TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
export TE_TAG=2215fa5c7557b66034068816020f9f611019e457
export NEMO_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2
export APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
export CAUSAL_CONV_TAG=v1.2.2.post1
1 change: 1 addition & 0 deletions tests/collections/llm/gpt_finetuning.py
@@ -54,6 +54,7 @@ def get_args():
pipeline_model_parallel_size=args.pp_size,
# Pipeline dtype is coupled with the bf16 mixed precision plugin
pipeline_dtype=torch.bfloat16,
ckpt_load_strictness="log_all", # Only for CI tests to use older versions of checkpoint
)

trainer = nl.Trainer(
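In the NeMo 2.0 test driver the same relaxation is wired in code rather than through a Hydra override: `ckpt_load_strictness="log_all"` is passed to the strategy that `gpt_finetuning.py` constructs, so mismatches against the older test checkpoints are logged instead of raised. A minimal sketch of how that kwarg fits into the surrounding setup (the `nl.MegatronStrategy`/`nl.Trainer` wiring and the argument values are inferred from the diff context, not copied from the full script):

```python
import torch
from nemo import lightning as nl

strategy = nl.MegatronStrategy(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=2,
    # Pipeline dtype is coupled with the bf16 mixed-precision plugin.
    pipeline_dtype=torch.bfloat16,
    # CI-only relaxation: log checkpoint/model mismatches instead of raising,
    # so checkpoints produced by older NeMo versions still load.
    ckpt_load_strictness="log_all",
)

trainer = nl.Trainer(
    devices=2,
    accelerator="gpu",
    strategy=strategy,
)
```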