diff --git a/internlm/core/trainer_builder.py b/internlm/core/trainer_builder.py index 7dbaec05..6f448aad 100644 --- a/internlm/core/trainer_builder.py +++ b/internlm/core/trainer_builder.py @@ -213,7 +213,8 @@ def _initialize_memory_profiler(self, model, optimizer, profiling) -> Optional[S + f"wp{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}", ) - return None + else: + return None def _initialize_batch_skipper(self, train_state) -> BatchSkipper: skip_batches = gpc.config.data.skip_batches @@ -346,7 +347,11 @@ def _record_metrics(self, batch_count: int, batch, start_time, loss, moe_loss, s ) def _should_evaluate(self) -> bool: - return gpc.config.data.valid_every > 0 and self.train_state.step_count % gpc.config.data.valid_every == 0 + return ( + gpc.config.data.valid_every > 0 + and self.train_state.step_count > 0 + and self.train_state.step_count % gpc.config.data.valid_every == 0 + ) def _evaluate(self): evaluate_on_val_dls(