Skip to content

Commit

Permalink
Config for stretched grid with needed adjustments
Browse files Browse the repository at this point in the history
  • Loading branch information
havardhhaugen committed Oct 21, 2024
1 parent fe3bbba commit 2a7fb15
Show file tree
Hide file tree
Showing 8 changed files with 230 additions and 10 deletions.
48 changes: 48 additions & 0 deletions lumi_train.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash
#SBATCH --output=/scratch/project_465001383/aifs/logs/name2.out
#SBATCH --error=/scratch/project_465001383/aifs/logs/name2.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --account=project_465001383
#SBATCH --partition=dev-g
#SBATCH --gpus-per-node=8
#SBATCH --time=00:15:00
#SBATCH --job-name=aifs
#SBATCH --exclusive

# Launch anemoi-training inside the LUMI PyTorch singularity container.
PROJECT_DIR=/pfs/lustrep4/scratch/$SLURM_JOB_ACCOUNT
CONTAINER_SCRIPT=$PROJECT_DIR/aifs/run-pytorch/run-pytorch.sh

#CHANGE THESE:
CONTAINER=$PROJECT_DIR/aifs/container/containers/aifs-met-pytorch-2.2.0-rocm-5.6.1-py3.9-v2.0-new-correct-anemoi-models-sort-vars.sif
PYTHON_SCRIPT=$PROJECT_DIR/haugenha/anemoi-training-setup/anemoi-training-config/anemoi-training/src/lumi_train.py
VENV=/users/haugenha/work/.venv-anemoi-training


module load LUMI/23.09 partition/G

# Make the AWS-OFI-RCCL plugin and Cray xpmem libraries visible inside the container.
export SINGULARITYENV_LD_LIBRARY_PATH=/opt/ompi/lib:${EBROOTAWSMINOFIMINRCCL}/lib:/opt/cray/xpmem/2.4.4-2.3_9.1__gff0e1d9.shasta/lib64:${SINGULARITYENV_LD_LIBRARY_PATH}

# MPI + OpenMP bindings: https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/distribution-binding
CPU_BIND="mask_cpu:fe000000000000,fe00000000000000,fe0000,fe000000,fe,fe00,fe00000000,fe0000000000"

# Optionally expose a Python virtual environment inside the container.
# Skipped when VENV is empty or the literal string "None".
if [[ "$VENV" != "None" && -n "$VENV" ]]; then
    export VIRTUAL_ENV=$VENV

    # Ensure the virtual environment is loaded inside the container.
    export PYTHONUSERBASE=$VIRTUAL_ENV
    # Prepend (not append) the venv bin dir so its executables shadow the
    # container's own installations.
    export PATH=$VIRTUAL_ENV/bin:$PATH
fi

# run run-pytorch.sh in singularity container like recommended
# in LUMI doc: https://lumi-supercomputer.github.io/LUMI-EasyBuild-docs/p/PyTorch
# Quote the path variables so the command survives any whitespace in them.
srun --cpu-bind="$CPU_BIND" \
    singularity exec -B /pfs:/pfs \
        -B /var/spool/slurmd \
        -B /opt/cray \
        -B /usr/lib64 \
        -B /usr/lib64/libjansson.so.4 \
        "$CONTAINER" "$CONTAINER_SCRIPT" "$PYTHON_SCRIPT"
2 changes: 1 addition & 1 deletion src/anemoi/training/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# nor does it submit to any jurisdiction.


from ._version import __version__ # noqa: F401
#from ._version import __version__ # noqa: F401
10 changes: 5 additions & 5 deletions src/anemoi/training/config/data/zarr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,22 @@ forcing:
- "sin_local_time"
- "insolation"
- "lsm"
- "sdor"
- "slor"
#- "sdor"
#- "slor"
- "z"
# features that are only part of the forecast state
# but are not used as the input to the model
diagnostic:
- tp
- cp
#- cp
remapped:

normalizer:
default: "mean-std"
min-max:
max:
- "sdor"
- "slor"
# - "sdor"
# - "slor"
- "z"
none:
- "cos_latitude"
Expand Down
5 changes: 2 additions & 3 deletions src/anemoi/training/config/diagnostics/eval_rollout.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ eval:
rollout: 12
frequency: 20
plot:
enabled: True
enabled: False
asynchronous: True
frequency: 750
sample_idx: 0
Expand All @@ -20,7 +20,7 @@ plot:
- 10v
- sp
- tp
- cp
# - cp
#Defining the accumulation levels for precipitation related fields and the colormap
accumulation_levels_plot: [0, 0.05, 0.1, 0.25, 0.5, 1, 1.5, 2, 3, 4, 5, 6, 7, 100] # in mm
cmap_accumulation: ["#ffffff", "#04e9e7", "#019ff4", "#0300f4", "#02fd02", "#01c501", "#008e00", "#fdf802", "#e5bc00", "#fd9500", "#fd0000", "#d40000", "#bc0000", "#f800fd"]
Expand Down Expand Up @@ -77,7 +77,6 @@ log:
log_model: False
project: 'Anemoi'
entity: ???
# logger options (these probably come with some overhead)
gradients: False
parameters: False
tensorboard:
Expand Down
68 changes: 68 additions & 0 deletions src/anemoi/training/config/graph/stretched_grid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Graph definition for a stretched-grid (LAM-in-global) Anemoi model.
# Data nodes come from a LAM dataset cut out of a global forcing dataset;
# the hidden mesh is a stretched triangular refinement that is finer over
# the LAM area. Edges wire encoder (data->hidden), processor
# (hidden->hidden) and decoder (hidden->data).

overwrite: False  # reuse an existing graph file instead of rebuilding

data: "stretched_grid"  # name of the data-node set defined below
hidden: "hidden"        # name of the hidden-mesh node set

nodes:
  stretched_grid:
    node_builder:
      _target_: anemoi.graphs.nodes.CutOutZarrDatasetNodes
      # LAM points are embedded in (cut out of) the global dataset.
      lam_dataset: ${hardware.paths.data}/${hardware.files.dataset_lam}
      forcing_dataset: ${hardware.paths.data}/${hardware.files.dataset}
      adjust: all
      # min_distance_km: 0
    attributes:
      area_weight:
        _target_: anemoi.graphs.nodes.attributes.AreaWeights
        norm: unit-max
        # lam_weights_rescale: 1.0
  hidden:
    node_builder:
      _target_: anemoi.graphs.nodes.StretchedTriNodes
      # Finer triangulation over the LAM region, coarser globally.
      lam_resolution: 8
      global_resolution: 5
      reference_node_name: ${graph.data}
      mask_attr_name: cutout
      margin_radius_km: 10

edges:
  # Encoder edges: data -> hidden, each hidden node gathers 12 neighbours.
  - source_name: ${graph.data}
    target_name: ${graph.hidden}
    edge_builder:
      _target_: anemoi.graphs.edges.KNNEdges
      num_nearest_neighbours: 12
    attributes:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-max
        invert: True
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std
  # Processor edges: hidden -> hidden, multi-scale mesh connectivity.
  - source_name: ${graph.hidden}
    target_name: ${graph.hidden}
    edge_builder:
      _target_: anemoi.graphs.edges.MultiScaleEdges
      x_hops: 1
    attributes:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-max
        invert: True
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std
  # Decoder edges: hidden -> data, each data node gathers 3 neighbours.
  - source_name: ${graph.hidden}
    target_name: ${graph.data}
    edge_builder:
      _target_: anemoi.graphs.edges.KNNEdges
      num_nearest_neighbours: 3
    attributes:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-max
        invert: True
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std

96 changes: 96 additions & 0 deletions src/anemoi/training/config/stretched_grid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Top-level Hydra config for a stretched-grid training run
# (MEPS LAM cut out of a global ERA5 dataset) on LUMI.
defaults:
  - data: zarr
  - dataloader: native_grid
  - diagnostics: eval_rollout
  - hardware: slurm
  - graph: stretched_grid
  - model: graphtransformer
  - training: default
  - _self_  # values below override the groups above



dataloader:
  num_workers:
    training: 2
    validation: 2
    test: 2
    predict: 2
  batch_size:
    training: 1
    validation: 1
    test: 1
    predict: 1

  # Cutout dataset: LAM entry first, then the surrounding global dataset.
  dataset:
    cutout:
      - dataset: ${hardware.paths.data}/${hardware.files.dataset_lam}
      - dataset: ${hardware.paths.data}/${hardware.files.dataset}
    adjust: all

  # NOTE(review): very small batch limits — looks like a smoke-test setting;
  # confirm before a full training run.
  limit_batches:
    training: 20
    validation: 20

  training:
    start: 2020-02-05
    end: 2022-05-31 #15
    #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600]
    # Statistics are read from the global dataset, not computed on the cutout.
    statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr
    # sort_vars: True
  validation:
    start: 2022-06-01
    end: 2023-05-31
    #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600]
    statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr
    # sort_vars: True
  test:
    start: 2022-06-01
    end: 2023-05-31
    #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600]
    statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr
    # sort_vars: True

hardware: #change these to lumi paths
  num_gpus_per_node: 8
  num_nodes: 1
  num_gpus_per_model: 1
  paths:
    data: /pfs/lustrep4/scratch/project_465001383/aifs/dataset/
    # output_base: /pfs/lustrep4/scratch/project_465000899/aifs/experiments/ #/lustre/storeB/project/nwp/aifs/test_output/
    output: /pfs/lustrep4/scratch/project_465001383/aifs/experiments/test-anemoi-training/ #do not change this, it will be modified in code to be output_base + run_id.
    graph: /pfs/lustrep4/scratch/project_465001383/aifs/graphs/ #/lustre/storeB/project/nwp/aifs/test_graphs/
  files:
    dataset: ERA5/aifs-od-an-oper-0001-mars-o96-2016-2023-6h-v6.zarr #aifs-od-an-oper-0001-mars-o96-2016-2023-6h-v6.zarr
    dataset_lam: MEPS/aifs-meps-10km-2020-2024-6h-v6.zarr
    graph: test-anemoi-training.pt
    warm_start: null #specific checkpoint to start from, defaults to last.ckpt

data:
  # NOTE(review): YAML parses this as the string "None", not null — confirm
  # the consumer expects that spelling.
  resolution: None

model:
  num_channels: 512
  trainable_parameters:
    data: 0
    hidden: 0
    data2hidden: 0
    hidden2data: 0
    hidden2hidden: 0 # GNN and GraphTransformer Processor only


# NOTE(review): this key is "graphs" while the defaults group above is
# "graph" — verify it matches the name the code actually reads.
graphs:
  output_path: ${hardware.paths.graph}${hardware.files.graph}
  save_graph_plots: False

training:
  run_id: null #path to store the experiment in with output_base as root, null for random name, =fork_run_id to continue training in the same folder.
  fork_run_id: null #path to the experiment to fork from with output_base as root
  load_weights_only: False #loads entire model if False, loads only weights if True
  max_epochs: 50
  lr:
    # NOTE(review): min (8.0e-6) is larger than rate (5.0e-6) — the schedule
    # can never decay below its starting value; confirm this is intended.
    rate: 5.0e-6
    iterations: 10000
    min: 8.0e-6


2 changes: 1 addition & 1 deletion src/anemoi/training/config/training/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ loss_scaling:
10v: 0.1
2d: 0.5
tp: 0.025
cp: 0.0025
# cp: 0.0025

metrics:
- z_500
Expand Down
9 changes: 9 additions & 0 deletions src/lumi_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Launch Anemoi training with the stretched-grid Hydra config.

Composes the ``stretched_grid`` config from the packaged config directory
and hands it to :class:`AnemoiTrainer`. Intended to be invoked as a script
(e.g. by the LUMI batch job), not imported.
"""

from hydra import compose, initialize

from anemoi.training.train.train import AnemoiTrainer


def main() -> None:
    """Compose the stretched-grid config and run training."""
    # version_base=None keeps Hydra's legacy version behaviour;
    # config_path is resolved relative to this file.
    with initialize(version_base=None, config_path="anemoi/training/config"):
        config = compose(config_name="stretched_grid")

    trainer = AnemoiTrainer(config)
    trainer.train()


# Guard so importing this module does not kick off a training run.
if __name__ == "__main__":
    main()

0 comments on commit 2a7fb15

Please sign in to comment.