diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88488369..f50ee7f0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,8 @@ Keep it human-readable, your future self will thank you!
 Fixed bug in power spectra plotting for the n320 resolution.
 
 ### Added
 
+- Introduce variable to configure (Cosine Annealing) optimizer warm-up [#155](https://github.com/ecmwf/anemoi-training/pull/155)
+
 - Add reader groups to reduce CPU memory usage and increase dataloader throughput [#76](https://github.com/ecmwf/anemoi-training/pull/76)
 
diff --git a/docs/user-guide/training.rst b/docs/user-guide/training.rst
index 5be08222..588b34d9 100644
--- a/docs/user-guide/training.rst
+++ b/docs/user-guide/training.rst
@@ -188,10 +188,11 @@ level has a weighting less than 0.2).
 ***************
 
 Anemoi training uses the ``CosineLRScheduler`` from PyTorch as it's
-learning rate scheduler. The user can configure the maximum learning
-rate by setting ``config.training.lr.rate``. Note that this learning
-rate is scaled by the number of GPUs where for the `data parallelism
-`_.
+learning rate scheduler. Documentation for this scheduler can be found at
+https://github.com/huggingface/pytorch-image-models/blob/main/timm/scheduler/cosine_lr.py.
+The user can configure the maximum learning rate by setting
+``config.training.lr.rate``. Note that this learning rate is scaled by
+the number of GPUs where for the `data parallelism `_.
 
 .. code:: yaml
 
@@ -201,7 +202,11 @@ The user can also control the rate at which the learning rate decreases
 by setting the total number of iterations through
 ``config.training.lr.iterations`` and the minimum learning rate reached
 through ``config.training.lr.min``. Note that the minimum learning rate
-is not scaled by the number of GPUs.
+is not scaled by the number of GPUs. The user can also control the
+warmup period by setting ``config.training.lr.warmup_t``. If the warmup
+period is set to 0, the learning rate will start at the maximum learning
+rate. If no warmup period is defined, a default warmup period of 1000
+iterations is used.
 
 *********
  Rollout
diff --git a/src/anemoi/training/config/training/default.yaml b/src/anemoi/training/config/training/default.yaml
index 1c103827..af168ecc 100644
--- a/src/anemoi/training/config/training/default.yaml
+++ b/src/anemoi/training/config/training/default.yaml
@@ -83,6 +83,7 @@ lr:
   rate: 0.625e-4 #local_lr
   iterations: ${training.max_steps} # NOTE: When max_epochs < max_steps, scheduler will run for max_steps
   min: 3e-7 #Not scaled by #GPU
+  warmup_t: 1000 # warm-up iterations; 0 starts at the maximum learning rate
 
 # Changes in per-gpu batch_size should come with a rescaling of the local_lr
 # in order to keep a constant global_lr
diff --git a/src/anemoi/training/train/forecaster.py b/src/anemoi/training/train/forecaster.py
index 659c906c..a3abd59c 100644
--- a/src/anemoi/training/train/forecaster.py
+++ b/src/anemoi/training/train/forecaster.py
@@ -127,6 +127,7 @@ def __init__(
             * config.training.lr.rate
             / config.hardware.num_gpus_per_model
         )
+        self.warmup_t = getattr(config.training.lr, "warmup_t", 1000)
         self.lr_iterations = config.training.lr.iterations
         self.lr_min = config.training.lr.min
         self.rollout = config.training.rollout.start
@@ -638,6 +639,6 @@ def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict]]
             optimizer,
             lr_min=self.lr_min,
             t_initial=self.lr_iterations,
-            warmup_t=1000,
+            warmup_t=self.warmup_t,
         )
         return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
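
Below is a minimal, illustrative sketch (not part of the patch) of how a `warmup_t` value taken from `training.lr` is expected to reach timm's `CosineLRScheduler`. The model, optimizer, and numeric values are placeholders; only the `warmup_t` keyword corresponds to the option introduced above.

```python
# Illustrative sketch only: passing a configurable warm-up period to
# timm's CosineLRScheduler. The model and values are placeholders,
# not anemoi-training code.
import torch
from timm.scheduler.cosine_lr import CosineLRScheduler

lr_rate = 0.625e-4       # stands in for config.training.lr.rate (local LR)
lr_iterations = 300_000  # stands in for config.training.lr.iterations
lr_min = 3e-7            # stands in for config.training.lr.min
warmup_t = 1000          # stands in for config.training.lr.warmup_t

model = torch.nn.Linear(8, 8)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=lr_rate)

scheduler = CosineLRScheduler(
    optimizer,
    lr_min=lr_min,
    t_initial=lr_iterations,
    warmup_t=warmup_t,  # 0 starts directly at the maximum learning rate
)

# The scheduler is advanced once per optimizer step (interval="step"),
# so the warm-up lasts warmup_t training iterations.
for step in range(3):
    loss = model(torch.randn(4, 8)).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    scheduler.step(step + 1)
```

Setting `warmup_t: 0` in the config skips the warm-up and starts at the maximum learning rate, matching the behaviour described in the documentation change.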