add pretrain tests for hf (#11673)
* add pretrain tests

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>

* fix

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* add max_steps

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

---------

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>
Co-authored-by: akoumpa <akoumpa@users.noreply.github.com>
akoumpa and akoumpa authored Jan 7, 2025
1 parent 731e812 commit 8f3485e
Showing 4 changed files with 266 additions and 1 deletion.
63 changes: 62 additions & 1 deletion .github/workflows/cicd-main.yml
@@ -3701,6 +3701,17 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  L2_HF_Transformer_PT_2gpu:
    needs: [ cicd-test-container-setup ]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu') || needs.cicd-test-container-setup.outputs.all == 'true'
    with:
      RUNNER: self-hosted-azure
      SCRIPT: |
        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  L2_HF_Transformer_SFT_2gpu_nemorun:
    needs: [ cicd-test-container-setup ]
    uses: ./.github/workflows/_test_template.yml
@@ -3712,6 +3723,39 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  L2_HF_Transformer_PT_2gpu_nemorun:
    needs: [ cicd-test-container-setup ]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu_nemorun') || needs.cicd-test-container-setup.outputs.all == 'true'
    with:
      RUNNER: self-hosted-azure
      SCRIPT: |
        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  L2_HF_Transformer_PT:
    needs: [ cicd-test-container-setup ]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT') || needs.cicd-test-container-setup.outputs.all == 'true'
    with:
      RUNNER: self-hosted-azure-gpus-1
      SCRIPT: |
        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  L2_HF_Transformer_PT_nemorun:
    needs: [ cicd-test-container-setup ]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_nemorun') || needs.cicd-test-container-setup.outputs.all == 'true'
    with:
      RUNNER: self-hosted-azure-gpus-1
      SCRIPT: |
        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  L2_HF_Transformer_SFT:
    needs: [ cicd-test-container-setup ]
    uses: ./.github/workflows/_test_template.yml
@@ -3733,7 +3777,7 @@ jobs:
        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  L2_HF_Transformer_SFT_TE_Acceleration:
    needs: [ cicd-test-container-setup ]
    uses: ./.github/workflows/_test_template.yml
@@ -3745,6 +3789,17 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  L2_HF_Transformer_PT_TE_Acceleration:
    needs: [ cicd-test-container-setup ]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_TE_Acceleration') || needs.cicd-test-container-setup.outputs.all == 'true'
    with:
      RUNNER: self-hosted-azure-gpus-1
      SCRIPT: |
        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --model-accelerator te --max-steps 10
      AFTER_SCRIPT: |
        rm -rf nemo_experiments

  # L2: Megatron Mock Data Generation
  L2_Megatron_Mock_Data_Generation_MockGPTDataset:
    needs: [cicd-test-container-setup]
@@ -4942,6 +4997,12 @@ jobs:
      - L2_VLM_HF_Transformer_PEFT_4bit
      - L2_HF_Transformer_SFT_2gpu_nemorun
      - L2_HF_Transformer_SFT_TE_Acceleration
      - L2_HF_Transformer_PT
      - L2_HF_Transformer_PT_nemorun
      - L2_HF_Transformer_PT_2gpu
      - L2_HF_Transformer_PT_2gpu_nemorun
      - L2_HF_Transformer_PT_TE_Acceleration
      - L2_VLM_HF_Transformer_PEFT
      - L2_NeMo_2_SSM_Pretraining
      - L2_NeMo_2_SSM_Finetuning
      - L2_NeMo_2_T5_Pretraining
2 changes: 2 additions & 0 deletions nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py
@@ -127,6 +127,7 @@ def pretrain_recipe(
    num_gpus_per_node: int = 8,
    fn=pretrain,
    model_name: str = '',
    max_steps: int = 100,
) -> run.Partial:
    """
    Create a pre-training recipe for a HFAutoModelForCausalLM model.
@@ -159,6 +160,7 @@ def pretrain_recipe(
            num_nodes=num_nodes,
            num_gpus_per_node=num_gpus_per_node,
            callbacks=[run.Config(TimingCallback)],
            max_steps=max_steps,
        ),
        data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1),
        log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
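The new max_steps argument is threaded into the trainer config of the returned run.Partial. A minimal sketch of how a caller might exercise it, mirroring tests/collections/llm/hf/pretrain_nemorun.py below; the model path is illustrative, and the attribute check assumes max_steps is forwarded to the trainer config as the diff above shows:

from nemo.collections import llm

# Build a short smoke-test recipe; max_steps is the parameter added by this commit.
recipe = llm.hf_auto_model_for_causal_lm.pretrain_recipe(
    model_name="/home/TestData/nlp/hf_gemma/hf_gemma_2b",  # illustrative HF model id or local path
    name="pt",
    num_nodes=1,
    num_gpus_per_node=1,
    max_steps=10,
)

# The value should be visible on the trainer config before anything is launched.
print(recipe.trainer.max_steps)  # expected: 10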
133 changes: 133 additions & 0 deletions tests/collections/llm/hf/pretrain.py
@@ -0,0 +1,133 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fiddle as fdl
from lightning.pytorch.loggers import WandbLogger

from nemo import lightning as nl
from nemo.collections import llm
from nemo.lightning.pytorch.accelerate.transformer_engine import is_te_accelerated


DATA_PATH = '/home/TestData/lite/hf_cache/squad/'


def make_squad_hf_dataset(data_path, tokenizer):
    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

    def formatting_prompts_func(examples):
        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
        instruction = examples["context"]
        input = examples["question"]
        output = examples["answers"]['text']
        if isinstance(output, list):
            output = output[0]
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        ans = tokenizer(text)
        ans['labels'] = ans['input_ids']
        return ans

    tokenizer = getattr(tokenizer, 'tokenizer', tokenizer)
    datamodule = llm.HFDatasetDataModule(data_path, split="train[:100]", pad_token_id=tokenizer.eos_token_id)

    datamodule.map(
        formatting_prompts_func,
        batched=False,
        batch_size=2,
        remove_columns=["id", "title", "context", "question", 'answers'],
    )

    return datamodule


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='meta-llama/Llama-3.2-1B')
    parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp'])
    parser.add_argument('--devices', default=1)
    parser.add_argument('--accelerator', default='gpu', choices=['gpu'])
    parser.add_argument('--model-accelerator', default=None, choices=['te'])
    parser.add_argument('--max-steps', type=int, default=100)
    parser.add_argument("--fp8-autocast", default=False, action='store_true')
    parser.add_argument('--wandb-project', type=str, default=None)
    parser.add_argument('--model-save-path', type=str, default=None)
    args = parser.parse_args()

    wandb = None
    if args.wandb_project is not None:
        model = '_'.join(args.model.split('/')[-2:])
        wandb = WandbLogger(
            project=args.wandb_project,
            name=f'{model}_dev{args.devices}_strat_{args.strategy}',
        )
    grad_clip = 0.5
    if args.strategy == 'fsdp':
        # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81
        grad_clip = None
    use_dist_samp = False

    model_accelerator = None
    if args.model_accelerator == "te":
        from functools import partial
        from nemo.lightning.pytorch.accelerate.transformer_engine import te_accelerate

        model_accelerator = partial(te_accelerate, fp8_autocast=args.fp8_autocast)

    # load_pretrained_weights=False starts from a randomly initialized model,
    # which is what makes this a pre-training (rather than fine-tuning) test.
    model = llm.HFAutoModelForCausalLM(
        model_name=args.model, model_accelerator=model_accelerator, load_pretrained_weights=False
    )
    tokenizer = model.tokenizer

    llm.api.finetune(
        model=model,
        data=make_squad_hf_dataset(DATA_PATH, tokenizer),
        trainer=nl.Trainer(
            devices=args.devices,
            max_steps=args.max_steps,
            accelerator=args.accelerator,
            strategy=args.strategy,
            log_every_n_steps=1,
            limit_val_batches=0.0,
            num_sanity_val_steps=0,
            accumulate_grad_batches=10,
            gradient_clip_val=grad_clip,
            use_distributed_sampler=use_dist_samp,
            callbacks=[],
            logger=wandb,
        ),
        optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)),
        log=None,
    )

    if args.model_accelerator:
        if args.model_accelerator == "te":
            te_acc = is_te_accelerated(model.model)
            assert te_acc, "Transformer Engine acceleration was unsuccessful"
            print("TE Accelerated: ", te_acc)

    if args.model_save_path is not None:
        model.save_pretrained(args.model_save_path)
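For reference, a standalone illustration of the prompt that formatting_prompts_func above builds for one SQuAD-style record before tokenization. The record here is made up for illustration, not taken from the test data, and the EOS token is omitted:

# Illustrative only: the alpaca-style template copied from pretrain.py above.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

# Hypothetical SQuAD-like record with the same fields the datamodule provides.
example = {
    "context": "The Normans were the people who in the 10th and 11th centuries gave their name to Normandy.",
    "question": "Who gave their name to Normandy?",
    "answers": {"text": ["The Normans"]},
}

answer = example["answers"]["text"]
if isinstance(answer, list):
    answer = answer[0]

print(alpaca_prompt.format(example["context"], example["question"], answer))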
69 changes: 69 additions & 0 deletions tests/collections/llm/hf/pretrain_nemorun.py
@@ -0,0 +1,69 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import nemo_run as run

from nemo.collections import llm
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.llm.gpt.data.hf_dataset import SquadHFDataModule


DATA_PATH = '/home/TestData/lite/hf_cache/squad/'


def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
    # Env vars for jobs are configured here
    env_vars = {
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
        "NCCL_NVLS_ENABLE": "0",
        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
        "NVTE_ASYNC_AMAX_REDUCTION": "1",
        "NVTE_FUSED_ATTN": "0",
    }

    executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

    return executor


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='meta-llama/Llama-3.2-1B')
    parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp'])
    parser.add_argument('--devices', default=1)
    parser.add_argument('--accelerator', default='gpu', choices=['gpu'])
    parser.add_argument('--max-steps', type=int, default=100)
    args = parser.parse_args()

    recipe = llm.hf_auto_model_for_causal_lm.pretrain_recipe(
        model_name=args.model,
        name="pt",
        num_nodes=1,
        num_gpus_per_node=args.devices,
        max_steps=args.max_steps,
    )
    recipe.trainer.val_check_interval = 50

    tokenizer = llm.HFAutoModelForCausalLM.configure_tokenizer(args.model)
    recipe.data = run.Config(
        SquadHFDataModule,
        path_or_dataset=DATA_PATH,
        split="train[:100]",
        pad_token_id=tokenizer.tokenizer.eos_token_id,
        tokenizer=run.Config(AutoTokenizer, pretrained_model_name=args.model),
    )
    executor = local_executor_torchrun(nodes=recipe.trainer.num_nodes, devices=recipe.trainer.devices)
    run.run(recipe, executor=executor)
