diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88488369..f50ee7f0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,8 @@ Keep it human-readable, your future self will thank you!
 Fixed bug in power spectra plotting for the n320 resolution.
 
 ### Added
 
+- Introduce variable to configure (Cosine Annealing) optimizer warm-up [#155](https://github.com/ecmwf/anemoi-training/pull/155)
+
 - Add reader groups to reduce CPU memory usage and increase dataloader throughput [#76](https://github.com/ecmwf/anemoi-training/pull/76)
 
diff --git a/docs/user-guide/training.rst b/docs/user-guide/training.rst
index 5be08222..588b34d9 100644
--- a/docs/user-guide/training.rst
+++ b/docs/user-guide/training.rst
@@ -188,10 +188,11 @@ level has a weighting less than 0.2).
 ***************
 
 Anemoi training uses the ``CosineLRScheduler`` from PyTorch as it's
-learning rate scheduler. The user can configure the maximum learning
-rate by setting ``config.training.lr.rate``. Note that this learning
-rate is scaled by the number of GPUs where for the `data parallelism
-`_.
+learning rate scheduler. Documentation for this scheduler can be found at
+https://github.com/huggingface/pytorch-image-models/blob/main/timm/scheduler/cosine_lr.py.
+The user can configure the maximum learning rate by setting
+``config.training.lr.rate``. Note that this learning rate is scaled by
+the number of GPUs where for the `data parallelism `_.
 
 .. code:: yaml
 
@@ -201,7 +202,11 @@ The user can also control the rate at which the learning rate decreases
 by setting the total number of iterations through
 ``config.training.lr.iterations`` and the minimum learning rate reached
 through ``config.training.lr.min``. Note that the minimum learning rate
-is not scaled by the number of GPUs.
+is not scaled by the number of GPUs. The user can also control the
+warmup period by setting ``config.training.lr.warmup_t``. If the warmup
+period is set to 0, the learning rate will start at the maximum learning
+rate. If no warmup period is defined, a default warmup period of 1000
+iterations is used.
 
 *********
  Rollout
diff --git a/src/anemoi/training/config/training/default.yaml b/src/anemoi/training/config/training/default.yaml
index 1c103827..af168ecc 100644
--- a/src/anemoi/training/config/training/default.yaml
+++ b/src/anemoi/training/config/training/default.yaml
@@ -83,6 +83,7 @@ lr:
   rate: 0.625e-4 #local_lr
   iterations: ${training.max_steps} # NOTE: When max_epochs < max_steps, scheduler will run for max_steps
   min: 3e-7 #Not scaled by #GPU
+  warmup_t: 1000 # warm-up iterations; 0 starts at the maximum learning rate
 
 # Changes in per-gpu batch_size should come with a rescaling of the local_lr
 # in order to keep a constant global_lr
diff --git a/src/anemoi/training/train/forecaster.py b/src/anemoi/training/train/forecaster.py
index 659c906c..a3abd59c 100644
--- a/src/anemoi/training/train/forecaster.py
+++ b/src/anemoi/training/train/forecaster.py
@@ -127,6 +127,7 @@ def __init__(
             * config.training.lr.rate
             / config.hardware.num_gpus_per_model
         )
+        self.warmup_t = getattr(config.training.lr, "warmup_t", 1000)
         self.lr_iterations = config.training.lr.iterations
         self.lr_min = config.training.lr.min
         self.rollout = config.training.rollout.start
@@ -638,6 +639,6 @@ def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict]]
             optimizer,
             lr_min=self.lr_min,
             t_initial=self.lr_iterations,
-            warmup_t=1000,
+            warmup_t=self.warmup_t,
         )
         return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
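
Below is a minimal, illustrative sketch (not part of the patch) of how a `warmup_t` value taken from `training.lr` is expected to reach timm's `CosineLRScheduler`. The model, optimizer, and numeric values are placeholders; only the `warmup_t` keyword corresponds to the option introduced above.

```python
# Illustrative sketch only: passing a configurable warm-up period to
# timm's CosineLRScheduler. The model and values are placeholders,
# not anemoi-training code.
import torch
from timm.scheduler.cosine_lr import CosineLRScheduler

lr_rate = 0.625e-4       # stands in for config.training.lr.rate (local LR)
lr_iterations = 300_000  # stands in for config.training.lr.iterations
lr_min = 3e-7            # stands in for config.training.lr.min
warmup_t = 1000          # stands in for config.training.lr.warmup_t

model = torch.nn.Linear(8, 8)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=lr_rate)

scheduler = CosineLRScheduler(
    optimizer,
    lr_min=lr_min,
    t_initial=lr_iterations,
    warmup_t=warmup_t,  # 0 starts directly at the maximum learning rate
)

# The scheduler is advanced once per optimizer step (interval="step"),
# so the warm-up lasts warmup_t training iterations.
for step in range(3):
    loss = model(torch.randn(4, 8)).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    scheduler.step(step + 1)
```

Setting `warmup_t: 0` in the config skips the warm-up and starts at the maximum learning rate, matching the behaviour described in the documentation change.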