From 2a7fb15970545dc7a52607fb8e895037b638daf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5vard=20Haugen?= Date: Mon, 21 Oct 2024 14:52:03 +0300 Subject: [PATCH] Config for stretched grid with needed adjustments --- lumi_train.sh | 48 ++++++++++ src/anemoi/training/__init__.py | 2 +- src/anemoi/training/config/data/zarr.yaml | 10 +- .../config/diagnostics/eval_rollout.yaml | 5 +- .../training/config/graph/stretched_grid.yaml | 68 +++++++++++++ .../training/config/stretched_grid.yaml | 96 +++++++++++++++++++ .../training/config/training/default.yaml | 2 +- src/lumi_train.py | 9 ++ 8 files changed, 230 insertions(+), 10 deletions(-) create mode 100644 lumi_train.sh create mode 100644 src/anemoi/training/config/graph/stretched_grid.yaml create mode 100644 src/anemoi/training/config/stretched_grid.yaml create mode 100644 src/lumi_train.py diff --git a/lumi_train.sh b/lumi_train.sh new file mode 100644 index 00000000..a653f9e0 --- /dev/null +++ b/lumi_train.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#SBATCH --output=/scratch/project_465001383/aifs/logs/name2.out +#SBATCH --error=/scratch/project_465001383/aifs/logs/name2.err +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --account=project_465001383 +#SBATCH --partition=dev-g +#SBATCH --gpus-per-node=8 +#SBATCH --time=00:15:00 +#SBATCH --job-name=aifs +#SBATCH --exclusive + +PROJECT_DIR=/pfs/lustrep4/scratch/$SLURM_JOB_ACCOUNT +CONTAINER_SCRIPT=$PROJECT_DIR/aifs/run-pytorch/run-pytorch.sh + +#CHANGE THESE: +CONTAINER=$PROJECT_DIR/aifs/container/containers/aifs-met-pytorch-2.2.0-rocm-5.6.1-py3.9-v2.0-new-correct-anemoi-models-sort-vars.sif +PYTHON_SCRIPT=$PROJECT_DIR/haugenha/anemoi-training-setup/anemoi-training-config/anemoi-training/src/lumi_train.py +VENV=/users/haugenha/work/.venv-anemoi-training + + +module load LUMI/23.09 partition/G + +export SINGULARITYENV_LD_LIBRARY_PATH=/opt/ompi/lib:${EBROOTAWSMINOFIMINRCCL}/lib:/opt/cray/xpmem/2.4.4-2.3_9.1__gff0e1d9.shasta/lib64:${SINGULARITYENV_LD_LIBRARY_PATH} + +# MPI 
+ OpenMP bindings: https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/distribution-binding +CPU_BIND="mask_cpu:fe000000000000,fe00000000000000,fe0000,fe000000,fe,fe00,fe00000000,fe0000000000" + +if [[ "$VENV" != "None" && -n "$VENV" ]]; then +# Use this virtual environment + export VIRTUAL_ENV=$VENV + +# Ensure the virtual environment is loaded inside the container + export PYTHONUSERBASE=$VIRTUAL_ENV + export PATH=$PATH:$VIRTUAL_ENV/bin +else + : +fi + +# Run run-pytorch.sh inside the Singularity container, as recommended +# in the LUMI docs: https://lumi-supercomputer.github.io/LUMI-EasyBuild-docs/p/PyTorch +srun --cpu-bind=$CPU_BIND \ + singularity exec -B /pfs:/pfs \ + -B /var/spool/slurmd \ + -B /opt/cray \ + -B /usr/lib64 \ + -B /usr/lib64/libjansson.so.4 \ + $CONTAINER $CONTAINER_SCRIPT $PYTHON_SCRIPT \ No newline at end of file diff --git a/src/anemoi/training/__init__.py b/src/anemoi/training/__init__.py index d9a51e0b..c77b4c70 100644 --- a/src/anemoi/training/__init__.py +++ b/src/anemoi/training/__init__.py @@ -6,4 +6,4 @@ # nor does it submit to any jurisdiction. 
-from ._version import __version__ # noqa: F401 +#from ._version import __version__ # noqa: F401 diff --git a/src/anemoi/training/config/data/zarr.yaml b/src/anemoi/training/config/data/zarr.yaml index 1657861f..3b77bfb5 100644 --- a/src/anemoi/training/config/data/zarr.yaml +++ b/src/anemoi/training/config/data/zarr.yaml @@ -18,22 +18,22 @@ forcing: - "sin_local_time" - "insolation" - "lsm" -- "sdor" -- "slor" +#- "sdor" +#- "slor" - "z" # features that are only part of the forecast state # but are not used as the input to the model diagnostic: - tp -- cp +#- cp remapped: normalizer: default: "mean-std" min-max: max: - - "sdor" - - "slor" +# - "sdor" +# - "slor" - "z" none: - "cos_latitude" diff --git a/src/anemoi/training/config/diagnostics/eval_rollout.yaml b/src/anemoi/training/config/diagnostics/eval_rollout.yaml index 50e9a647..032faadf 100644 --- a/src/anemoi/training/config/diagnostics/eval_rollout.yaml +++ b/src/anemoi/training/config/diagnostics/eval_rollout.yaml @@ -5,7 +5,7 @@ eval: rollout: 12 frequency: 20 plot: - enabled: True + enabled: False asynchronous: True frequency: 750 sample_idx: 0 @@ -20,7 +20,7 @@ plot: - 10v - sp - tp - - cp +# - cp #Defining the accumulation levels for precipitation related fields and the colormap accumulation_levels_plot: [0, 0.05, 0.1, 0.25, 0.5, 1, 1.5, 2, 3, 4, 5, 6, 7, 100] # in mm cmap_accumulation: ["#ffffff", "#04e9e7", "#019ff4", "#0300f4", "#02fd02", "#01c501", "#008e00", "#fdf802", "#e5bc00", "#fd9500", "#fd0000", "#d40000", "#bc0000", "#f800fd"] @@ -77,7 +77,6 @@ log: log_model: False project: 'Anemoi' entity: ??? 
- # logger options (these probably come with some overhead) gradients: False parameters: False tensorboard: diff --git a/src/anemoi/training/config/graph/stretched_grid.yaml b/src/anemoi/training/config/graph/stretched_grid.yaml new file mode 100644 index 00000000..2f814fe2 --- /dev/null +++ b/src/anemoi/training/config/graph/stretched_grid.yaml @@ -0,0 +1,68 @@ +overwrite: False + +data: "stretched_grid" +hidden: "hidden" + +nodes: + stretched_grid: + node_builder: + _target_: anemoi.graphs.nodes.CutOutZarrDatasetNodes + lam_dataset: ${hardware.paths.data}/${hardware.files.dataset_lam} + forcing_dataset: ${hardware.paths.data}/${hardware.files.dataset} + adjust: all +# min_distance_km: 0 + attributes: + area_weight: + _target_: anemoi.graphs.nodes.attributes.AreaWeights + norm: unit-max +# lam_weights_rescale: 1.0 + hidden: + node_builder: + _target_: anemoi.graphs.nodes.StretchedTriNodes + lam_resolution: 8 + global_resolution: 5 + reference_node_name: ${graph.data} + mask_attr_name: cutout + margin_radius_km: 10 + +edges: +- source_name: ${graph.data} + target_name: ${graph.hidden} + edge_builder: + _target_: anemoi.graphs.edges.KNNEdges + num_nearest_neighbours: 12 + attributes: + edge_length: + _target_: anemoi.graphs.edges.attributes.EdgeLength + norm: unit-max + invert: True + edge_dirs: + _target_: anemoi.graphs.edges.attributes.EdgeDirection + norm: unit-std +- source_name: ${graph.hidden} + target_name: ${graph.hidden} + edge_builder: + _target_: anemoi.graphs.edges.MultiScaleEdges + x_hops: 1 + attributes: + edge_length: + _target_: anemoi.graphs.edges.attributes.EdgeLength + norm: unit-max + invert: True + edge_dirs: + _target_: anemoi.graphs.edges.attributes.EdgeDirection + norm: unit-std +- source_name: ${graph.hidden} + target_name: ${graph.data} + edge_builder: + _target_: anemoi.graphs.edges.KNNEdges + num_nearest_neighbours: 3 + attributes: + edge_length: + _target_: anemoi.graphs.edges.attributes.EdgeLength + norm: unit-max + invert: True + 
edge_dirs: + _target_: anemoi.graphs.edges.attributes.EdgeDirection + norm: unit-std + diff --git a/src/anemoi/training/config/stretched_grid.yaml b/src/anemoi/training/config/stretched_grid.yaml new file mode 100644 index 00000000..28e137d8 --- /dev/null +++ b/src/anemoi/training/config/stretched_grid.yaml @@ -0,0 +1,96 @@ +defaults: + - data: zarr + - dataloader: native_grid + - diagnostics: eval_rollout + - hardware: slurm + - graph: stretched_grid + - model: graphtransformer + - training: default + - _self_ + + + +dataloader: + num_workers: + training: 2 + validation: 2 + test: 2 + predict: 2 + batch_size: + training: 1 + validation: 1 + test: 1 + predict: 1 + + dataset: + cutout: + - dataset: ${hardware.paths.data}/${hardware.files.dataset_lam} + - dataset: ${hardware.paths.data}/${hardware.files.dataset} + adjust: all + + limit_batches: + training: 20 + validation: 20 + + training: + start: 2020-02-05 + end: 2022-05-31 #15 + #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600] + statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr +# sort_vars: True + validation: + start: 2022-06-01 + end: 2023-05-31 + #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600] + statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr +# sort_vars: True + test: + start: 2022-06-01 + end: 2023-05-31 + #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600] + statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr +# sort_vars: True + +hardware: #change these to lumi paths + num_gpus_per_node: 8 + num_nodes: 1 + num_gpus_per_model: 1 + paths: + data: /pfs/lustrep4/scratch/project_465001383/aifs/dataset/ +# output_base: /pfs/lustrep4/scratch/project_465000899/aifs/experiments/ #/lustre/storeB/project/nwp/aifs/test_output/ + output: /pfs/lustrep4/scratch/project_465001383/aifs/experiments/test-anemoi-training/ #do not change this, it 
will be modified in code to be output_base + run_id. + graph: /pfs/lustrep4/scratch/project_465001383/aifs/graphs/ #/lustre/storeB/project/nwp/aifs/test_graphs/ + files: + dataset: ERA5/aifs-od-an-oper-0001-mars-o96-2016-2023-6h-v6.zarr #aifs-od-an-oper-0001-mars-o96-2016-2023-6h-v6.zarr + dataset_lam: MEPS/aifs-meps-10km-2020-2024-6h-v6.zarr + graph: test-anemoi-training.pt + warm_start: null # specific checkpoint to start from; defaults to last.ckpt + +data: + resolution: None + +model: + num_channels: 512 + trainable_parameters: + data: 0 + hidden: 0 + data2hidden: 0 + hidden2data: 0 + hidden2hidden: 0 # GNN and GraphTransformer Processor only + + +graphs: + output_path: ${hardware.paths.graph}${hardware.files.graph} + save_graph_plots: False + +training: + run_id: null # path (with output_base as root) where the experiment is stored; null for a random name; set equal to fork_run_id to continue training in the same folder. + fork_run_id: null # path (with output_base as root) of the experiment to fork from + load_weights_only: False # loads the entire model if False; loads only the weights if True + max_epochs: 50 + lr: + rate: 5.0e-6 + iterations: 10000 + min: 8.0e-6 + + diff --git a/src/anemoi/training/config/training/default.yaml b/src/anemoi/training/config/training/default.yaml index 870eeb7a..6bd90dbf 100644 --- a/src/anemoi/training/config/training/default.yaml +++ b/src/anemoi/training/config/training/default.yaml @@ -71,7 +71,7 @@ loss_scaling: 10v: 0.1 2d: 0.5 tp: 0.025 - cp: 0.0025 +# cp: 0.0025 metrics: - z_500 diff --git a/src/lumi_train.py b/src/lumi_train.py new file mode 100644 index 00000000..6300f150 --- /dev/null +++ b/src/lumi_train.py @@ -0,0 +1,9 @@ +from hydra import compose, initialize +from anemoi.training.train.train import AnemoiTrainer + +with initialize(version_base=None, config_path="anemoi/training/config"): + config = compose(config_name="stretched_grid") + +T = AnemoiTrainer(config) + +T.train() \ No newline at end of file