Revert #11890 and add a test that would have caught the error (#11914)
* Revert "check restore_config first (#11890)"

This reverts commit 0075ed0.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* add test that would have caught it

Signed-off-by: Chen Cui <chcui@nvidia.com>
Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* fix param validation

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

---------

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Signed-off-by: Chen Cui <chcui@nvidia.com>
Co-authored-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
cuichenx and akoumpa authored Jan 22, 2025
1 parent 7434bcd commit 7772b76
Showing 2 changed files with 27 additions and 19 deletions.
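Why the revert: with the #11890 ordering, a run configured with restore_config took the restore branch on every start, so a job that crashed mid-training re-imported the base model instead of resuming from its own last checkpoint. A minimal standalone sketch of the precedence this commit restores (hypothetical names, not the NeMo API):

# Sketch of the restored control flow; resolve_resume, trainer_ckpt and
# restore_path are illustrative names, not NeMo's actual API.
from pathlib import Path
from typing import Optional


def resolve_resume(trainer_ckpt: Optional[Path], restore_path: Optional[str]) -> str:
    if trainer_ckpt is not None:
        # a checkpoint written by this run exists: mid-run resume wins
        return f"resume training from {trainer_ckpt}"
    elif restore_path is not None:
        # no run checkpoint yet: fall back to importing the base model
        return f"restore base model from {restore_path}"
    return "start from scratch"


# The reverted ordering checked restore_path first, so a job with
# restore_config set re-imported the base model after every crash instead
# of resuming -- the case the new CI assertion below exercises.
print(resolve_resume(None, "hf://some-model"))                      # restore base model
print(resolve_resume(Path("ckpt/step=5-last"), "hf://some-model"))  # resume wins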
42 changes: 23 additions & 19 deletions nemo/lightning/resume.py
@@ -103,7 +103,23 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None):
         if isinstance(trainer, fl.Fabric):
             raise NotImplementedError("Fabric is not supported yet.")
 
-        if self.restore_config:
+        trainer_ckpt_path = self.get_trainer_ckpt_path(model)
+        if trainer_ckpt_path:
+            trainer.ckpt_path = trainer_ckpt_path
+            trainer.checkpoint_callback.last_model_path = trainer_ckpt_path
+            # Load artifacts
+            if getattr(self.restore_config, 'load_artifacts', False):
+                if isinstance(trainer_ckpt_path, AdapterPath):
+                    # load tokenizer from the base model during peft resume, in case the first peft checkpoint
+                    # is deleted before the current peft checkpoint is saved
+                    context_path = trainer_ckpt_path.base_model_path / "context"
+                    if not context_path.exists():
+                        context_path = trainer_ckpt_path.base_model_path
+                else:
+                    context_path = self.get_context_path(model)
+                model = _try_restore_tokenizer(model, context_path)
+
+        elif self.restore_config:
             new_path = self._extract_path(
                 model=model,
                 path=self.restore_config.path,
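The AdapterPath branch above restores the tokenizer from the base model rather than from the (possibly deleted) first PEFT checkpoint. A minimal sketch of just the context-path fallback (standalone, hypothetical function name):

# Prefer <base_model>/context when it exists, else fall back to the base
# model directory itself; paths are illustrative.
from pathlib import Path


def pick_tokenizer_context(base_model_path: Path) -> Path:
    context_path = base_model_path / "context"
    return context_path if context_path.exists() else base_model_path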
@@ -123,21 +139,6 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None):
 
             _try_restore_tokenizer(model, context_path)
 
-        elif (trainer_ckpt_path := self.get_trainer_ckpt_path(model)) is not None:
-            trainer.ckpt_path = trainer_ckpt_path
-            trainer.checkpoint_callback.last_model_path = trainer_ckpt_path
-            # Load artifacts
-            if getattr(self.restore_config, 'load_artifacts', False):
-                if isinstance(trainer_ckpt_path, AdapterPath):
-                    # load tokenizer from the base model during peft resume, in case the first peft checkpoint
-                    # is deleted before the current peft checkpoint is saved
-                    context_path = trainer_ckpt_path.base_model_path / "context"
-                    if not context_path.exists():
-                        context_path = trainer_ckpt_path.base_model_path
-                else:
-                    context_path = self.get_context_path(model)
-                model = _try_restore_tokenizer(model, context_path)
-
     def _extract_path(
         self, model: Optional[io.ConnectorMixin], path: str, adapter_path: Optional[str] = None
     ) -> BasePath:
@@ -185,9 +186,12 @@ def _find_trainer_ckpt_path(self) -> Optional[Path]:
         checkpoint = None
 
         # Use <log_dir>/checkpoints/ unless `dirpath` is set
-        checkpoint_dir = (
-            Path(self.resume_from_directory) if self.resume_from_directory else Path(Path(log_dir) / "checkpoints")
-        )
+        if self.resume_from_directory:
+            checkpoint_dir = Path(self.resume_from_directory)
+        elif log_dir is not None:
+            checkpoint_dir = Path(Path(log_dir) / "checkpoints")
+        else:  # ie. if log_dir is None
+            return None
 
         # when using distributed checkpointing, checkpoint_dir is a directory of directories
         # we check for this here
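This hunk is the "fix param validation" part of the message: the replaced one-liner fed log_dir straight into Path(), which raises TypeError when log_dir is None, while the rewrite returns None so the caller can fall through to the restore_config branch. A standalone sketch of the fixed behavior (assumed signature, illustrative paths):

from pathlib import Path
from typing import Optional


def find_checkpoint_dir(resume_from_directory: Optional[str], log_dir: Optional[str]) -> Optional[Path]:
    # Use <log_dir>/checkpoints/ unless an explicit directory is given
    if resume_from_directory:
        return Path(resume_from_directory)
    elif log_dir is not None:
        return Path(log_dir) / "checkpoints"
    return None  # no log dir at all: nothing to resume from


assert find_checkpoint_dir(None, None) is None  # old one-liner: TypeError in Path()
assert find_checkpoint_dir(None, "/results") == Path("/results/checkpoints")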
4 changes: 4 additions & 0 deletions tests/collections/llm/gpt_finetuning.py

@@ -137,3 +137,7 @@ def get_args():
         optim=adam,
         resume=resume,
     )
+
+    if args.max_steps == 6:
+        # assert a resume has happened for CI tests
+        assert 'reduced_train_loss=' in str(trainer.ckpt_path), "Resume did not happen in this resume test."
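The assertion relies on checkpoint directory names embedding the monitored metric, so a resumed trainer's ckpt_path contains the 'reduced_train_loss=' fragment while a fresh run's does not. An illustrative value (the exact naming template here is an assumption):

# Hypothetical example of a resumed checkpoint path; only the
# 'reduced_train_loss=' fragment matters to the CI check.
ckpt_path = "results/checkpoints/model--reduced_train_loss=0.12-step=5-last"
assert 'reduced_train_loss=' in str(ckpt_path)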
