forked from ecmwf/anemoi-training
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Config for stretched grid with needed adjustments
- Loading branch information
1 parent
fe3bbba
commit 2a7fb15
Showing
8 changed files
with
230 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#!/bin/bash
#SBATCH --output=/scratch/project_465001383/aifs/logs/name2.out
#SBATCH --error=/scratch/project_465001383/aifs/logs/name2.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --account=project_465001383
#SBATCH --partition=dev-g
#SBATCH --gpus-per-node=8
#SBATCH --time=00:15:00
#SBATCH --job-name=aifs
#SBATCH --exclusive

# Slurm batch script: runs PYTHON_SCRIPT through CONTAINER_SCRIPT inside a
# Singularity container, launched with srun (8 tasks on 1 node, 8 GPUs).

# PROJECT_DIR is derived from the job's account; abort early with a clear
# message if the variable is missing instead of building a broken path.
: "${SLURM_JOB_ACCOUNT:?SLURM_JOB_ACCOUNT is not set; submit this script with sbatch}"

PROJECT_DIR=/pfs/lustrep4/scratch/$SLURM_JOB_ACCOUNT
CONTAINER_SCRIPT=$PROJECT_DIR/aifs/run-pytorch/run-pytorch.sh

#CHANGE THESE:
CONTAINER=$PROJECT_DIR/aifs/container/containers/aifs-met-pytorch-2.2.0-rocm-5.6.1-py3.9-v2.0-new-correct-anemoi-models-sort-vars.sif
PYTHON_SCRIPT=$PROJECT_DIR/haugenha/anemoi-training-setup/anemoi-training-config/anemoi-training/src/lumi_train.py
# Set VENV to "None" or leave it empty to skip the virtual-environment setup.
VENV=/users/haugenha/work/.venv-anemoi-training

module load LUMI/23.09 partition/G

# Library search path exported into the container. The previous value is
# appended only when it is non-empty, so no trailing ':' (an empty search
# path entry) is produced.
export SINGULARITYENV_LD_LIBRARY_PATH="/opt/ompi/lib:${EBROOTAWSMINOFIMINRCCL}/lib:/opt/cray/xpmem/2.4.4-2.3_9.1__gff0e1d9.shasta/lib64${SINGULARITYENV_LD_LIBRARY_PATH:+:${SINGULARITYENV_LD_LIBRARY_PATH}}"

# MPI + OpenMP bindings: https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/distribution-binding
# Eight CPU masks, matching --ntasks-per-node=8.
CPU_BIND="mask_cpu:fe000000000000,fe00000000000000,fe0000,fe000000,fe,fe00,fe00000000,fe0000000000"

if [[ "$VENV" != "None" && -n "$VENV" ]]; then
    # Set this virtual environment
    export VIRTUAL_ENV="$VENV"

    # Ensure the virtual environment is loaded inside the container
    export PYTHONUSERBASE="$VIRTUAL_ENV"
    export PATH="$PATH:$VIRTUAL_ENV/bin"
fi

# run run-pytorch.sh in singularity container like recommended
# in LUMI doc: https://lumi-supercomputer.github.io/LUMI-EasyBuild-docs/p/PyTorch
srun --cpu-bind="$CPU_BIND" \
    singularity exec -B /pfs:/pfs \
        -B /var/spool/slurmd \
        -B /opt/cray \
        -B /usr/lib64 \
        -B /usr/lib64/libjansson.so.4 \
        "$CONTAINER" "$CONTAINER_SCRIPT" "$PYTHON_SCRIPT"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Graph definition for the stretched-grid setup: data nodes named
# `stretched_grid`, hidden nodes named `hidden`, and three edge sets
# (data -> hidden, hidden -> hidden, hidden -> data).
# Interpolations of the form ${graph.*} / ${hardware.*} are resolved from the
# composed configuration.
overwrite: False

# Node-set names referenced below through ${graph.data} / ${graph.hidden}.
data: "stretched_grid"
hidden: "hidden"

nodes:
  # Data nodes; the key must match the value of `data` above.
  stretched_grid:
    node_builder:
      _target_: anemoi.graphs.nodes.CutOutZarrDatasetNodes
      lam_dataset: ${hardware.paths.data}/${hardware.files.dataset_lam}
      forcing_dataset: ${hardware.paths.data}/${hardware.files.dataset}
      adjust: all
      # min_distance_km: 0
    attributes:
      area_weight:
        _target_: anemoi.graphs.nodes.attributes.AreaWeights
        norm: unit-max
        # lam_weights_rescale: 1.0
  # Hidden nodes; the key must match the value of `hidden` above.
  hidden:
    node_builder:
      _target_: anemoi.graphs.nodes.StretchedTriNodes
      lam_resolution: 8
      global_resolution: 5
      reference_node_name: ${graph.data}
      mask_attr_name: cutout
      margin_radius_km: 10

edges:
  # data -> hidden
  - source_name: ${graph.data}
    target_name: ${graph.hidden}
    edge_builder:
      _target_: anemoi.graphs.edges.KNNEdges
      num_nearest_neighbours: 12
    attributes:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-max
        invert: True
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std
  # hidden -> hidden
  - source_name: ${graph.hidden}
    target_name: ${graph.hidden}
    edge_builder:
      _target_: anemoi.graphs.edges.MultiScaleEdges
      x_hops: 1
    attributes:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-max
        invert: True
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std
  # hidden -> data
  - source_name: ${graph.hidden}
    target_name: ${graph.data}
    edge_builder:
      _target_: anemoi.graphs.edges.KNNEdges
      num_nearest_neighbours: 3
    attributes:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-max
        invert: True
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Top-level experiment configuration for the stretched-grid run.
# The defaults list selects one option per config group; the sections below
# override individual values (`_self_` is listed last in the defaults).
defaults:
  - data: zarr
  - dataloader: native_grid
  - diagnostics: eval_rollout
  - hardware: slurm
  - graph: stretched_grid
  - model: graphtransformer
  - training: default
  - _self_

dataloader:
  num_workers:
    training: 2
    validation: 2
    test: 2
    predict: 2
  batch_size:
    training: 1
    validation: 1
    test: 1
    predict: 1

  # Cutout of the LAM dataset and the global dataset (LAM listed first).
  dataset:
    cutout:
      - dataset: ${hardware.paths.data}/${hardware.files.dataset_lam}
      - dataset: ${hardware.paths.data}/${hardware.files.dataset}
    adjust: all

  limit_batches:
    training: 20
    validation: 20

  training:
    start: 2020-02-05
    end: 2022-05-31 #15
    #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600]
    statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr
    # sort_vars: True
  validation:
    start: 2022-06-01
    end: 2023-05-31
    #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600]
    statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr
    # sort_vars: True
  test:
    start: 2022-06-01
    end: 2023-05-31
    #drop: [sdor, slor, cp] #, u_600, v_600, z_600, t_600, q_600, w_600]
    statistics: ${hardware.paths.data}/ERA5/aifs-od-an-oper-0001-mars-n320-2019-2023-6h-v6.zarr
    # sort_vars: True

hardware: #change these to lumi paths
  # Keep in sync with the batch script's --nodes / --gpus-per-node values.
  num_gpus_per_node: 8
  num_nodes: 1
  num_gpus_per_model: 1
  paths:
    data: /pfs/lustrep4/scratch/project_465001383/aifs/dataset/
    # output_base: /pfs/lustrep4/scratch/project_465000899/aifs/experiments/ #/lustre/storeB/project/nwp/aifs/test_output/
    output: /pfs/lustrep4/scratch/project_465001383/aifs/experiments/test-anemoi-training/ #do not change this, it will be modified in code to be output_base + run_id.
    graph: /pfs/lustrep4/scratch/project_465001383/aifs/graphs/ #/lustre/storeB/project/nwp/aifs/test_graphs/
  files:
    dataset: ERA5/aifs-od-an-oper-0001-mars-o96-2016-2023-6h-v6.zarr #aifs-od-an-oper-0001-mars-o96-2016-2023-6h-v6.zarr
    dataset_lam: MEPS/aifs-meps-10km-2020-2024-6h-v6.zarr
    graph: test-anemoi-training.pt
    warm_start: null #specific checkpoint to start from, defaults to last.ckpt

data:
  # NOTE(review): YAML loads a bare `None` as the string "None", not as null
  # (null is spelled `null` or `~`) - confirm which one is intended.
  resolution: None

model:
  num_channels: 512
  trainable_parameters:
    data: 0
    hidden: 0
    data2hidden: 0
    hidden2data: 0
    hidden2hidden: 0 # GNN and GraphTransformer Processor only

# NOTE(review): the defaults list selects the group `graph`, while this
# section is named `graphs` - confirm that this key is actually read.
graphs:
  output_path: ${hardware.paths.graph}${hardware.files.graph}
  save_graph_plots: False

training:
  run_id: null #path to store the experiment in with output_base as root, null for random name, =fork_run_id to continue training in the same folder.
  fork_run_id: null #path to the experiment to fork from with output_base as root
  load_weights_only: False #loads entire model if False, loads only weights if True
  max_epochs: 50
  lr:
    # NOTE(review): `min` (8.0e-6) is larger than `rate` (5.0e-6); confirm
    # whether `rate` is rescaled before the schedule is built.
    rate: 5.0e-6
    iterations: 10000
    min: 8.0e-6
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,7 +71,7 @@ loss_scaling: | |
10v: 0.1 | ||
2d: 0.5 | ||
tp: 0.025 | ||
cp: 0.0025 | ||
# cp: 0.0025 | ||
|
||
metrics: | ||
- z_500 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from hydra import compose, initialize | ||
from anemoi.training.train.train import AnemoiTrainer | ||
|
||
# Compose the "stretched_grid" configuration with Hydra's Compose API.
# config_path is a relative path; Hydra resolves it against the location of
# the calling module.
with initialize(version_base=None, config_path="anemoi/training/config"):
    cfg = compose(config_name="stretched_grid")

# Build the trainer from the composed configuration and start training.
trainer = AnemoiTrainer(cfg)
trainer.train()