From c3dfe0fdc10ee241eca81f278d46d3ae77f42bd6 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 18 Oct 2024 17:07:27 +0800 Subject: [PATCH] fix(checkpoint/components.py): fix lr scheduler resume step count (#351) --- internlm/checkpoint/components.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internlm/checkpoint/components.py b/internlm/checkpoint/components.py index 30fc5613..edf39b55 100644 --- a/internlm/checkpoint/components.py +++ b/internlm/checkpoint/components.py @@ -441,7 +441,9 @@ def load_scheduler(ckpt_path: str, lr_scheduler, optimizer, train_state: TrainSt ) lr_scheduler.load_state_dict(scheduler_states) - lr_scheduler.last_epoch = train_state.step_count + 1 + + # step_count has been updated before saving checkpoint. + lr_scheduler.last_epoch = train_state.step_count # compatible with old code that only have one param group if len(base_lrs) == 1: