Skip to content

Commit

Permalink
implement cropping during dataset creation
Browse files Browse the repository at this point in the history
  • Loading branch information
leifdenby committed Dec 12, 2024
1 parent aed4d22 commit 972186d
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 0 deletions.
109 changes: 109 additions & 0 deletions example.era5_cropped.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
schema_version: v0.5.0
dataset_version: v1.0.0

output:
variables:
static: [grid_index, static_feature]
forcing: [time, grid_index, forcing_feature]
coord_ranges:
time:
start: 1990-09-03T00:00
end: 1990-09-09T00:00
step: PT6H
chunking:
time: 1
splitting:
dim: time
splits:
train:
start: 1990-09-03T00:00
end: 1990-09-06T00:00
compute_statistics:
ops: [mean, std, diff_mean, diff_std]
dims: [grid_index, time]
val:
start: 1990-09-06T00:00
end: 1990-09-07T00:00
test:
start: 1990-09-07T00:00
end: 1990-09-09T00:00
domain_cropping:
margin_width_degrees: 0.2
interior_dataset_config_path: example.danra.yaml

inputs:
era_height_levels:
path: 'gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr'
dims: [time, longitude, latitude, level]
variables:
u_component_of_wind:
level:
values: [1000,]
units: hPa
dim_mapping:
time:
method: rename
dim: time
x:
method: rename
dim: longitude
y:
method: rename
dim: latitude
forcing_feature:
method: stack_variables_by_var_name
dims: [level]
name_format: "{var_name}{level}hPa"
grid_index:
method: stack
dims: [x, y]
target_output_variable: forcing

era5_surface:
path: 'gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr'
dims: [time, longitude, latitude, level]
variables:
- mean_sea_level_pressure
dim_mapping:
time:
method: rename
dim: time
x:
method: rename
dim: longitude
y:
method: rename
dim: latitude
forcing_feature:
method: stack_variables_by_var_name
name_format: "{var_name}"
grid_index:
method: stack
dims: [x, y]
target_output_variable: forcing

era5_static:
path: 'gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr'
dims: [time, longitude, latitude, level]
variables:
- land_sea_mask
dim_mapping:
x:
method: rename
dim: longitude
y:
method: rename
dim: latitude
static_feature:
method: stack_variables_by_var_name
name_format: "{var_name}"
grid_index:
method: stack
dims: [x, y]
target_output_variable: static

extra:
projection:
class_name: PlateCarree
kwargs:
central_longitude: 0.0
20 changes: 20 additions & 0 deletions mllam_data_prep/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,25 @@ class Splitting:
splits: Dict[str, Split]


@dataclass
class ConvexHullPaddingCrop:
    """
    Settings for cropping the domain of the output dataset using a margin
    around the convex hull of a separate "interior" dataset.

    The interior dataset is not given directly; it is identified by the path
    to its own dataset configuration file.

    Attributes
    ----------
    margin_width_degrees: float
        The width (in degrees) of the margin applied to the convex hull
        boundary of the interior dataset used to define the cropping domain.
    interior_dataset_config_path: str
        The path to the configuration file for the dataset defining the
        interior domain.
    """

    margin_width_degrees: float
    interior_dataset_config_path: str


@dataclass
class Output:
"""
Expand Down Expand Up @@ -260,6 +279,7 @@ class Output:
coord_ranges: Dict[str, Range] = None
chunking: Dict[str, int] = None
splitting: Splitting = None
domain_cropping: ConvexHullPaddingCrop = None


@dataclass
Expand Down
21 changes: 21 additions & 0 deletions mllam_data_prep/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from . import __version__
from .config import Config, InvalidConfigException
from .ops.cropping import crop_to_within_convex_hull_margin
from .ops.loading import load_and_subset_dataset
from .ops.mapping import map_dims_and_variables
from .ops.selection import select_by_kwargs
Expand Down Expand Up @@ -118,6 +119,14 @@ def create_dataset(config: Config):
"update the schema version used in your config to v0.5.0."
)

# If domain cropping is enabled, parse the interior domain config up front
# so that the user is alerted quickly if that config is invalid.
ds_interior_domain = None
if config.output.domain_cropping is not None:
config_interior_domain = Config.from_yaml_file(
file=config.output.domain_cropping.interior_dataset_config_path
)

output_config = config.output
output_coord_ranges = output_config.coord_ranges

Expand Down Expand Up @@ -238,6 +247,18 @@ def create_dataset(config: Config):
if d not in ds.coords:
ds[d] = np.arange(ds[d].size)

if config.output.domain_cropping is not None:
domain_cropping = config.output.domain_cropping
ds_interior_domain = create_dataset(config=config_interior_domain)
logger.info(
f"Cropping dataset to within convex hull margin of {ds_interior_domain} with a margin of {domain_cropping.margin_width_degrees} degrees"
)
ds = crop_to_within_convex_hull_margin(
ds=ds,
ds_reference=ds_interior_domain,
max_dist=domain_cropping.margin_width_degrees,
)

ds.attrs = {}
ds.attrs["schema_version"] = config.schema_version
ds.attrs["dataset_version"] = config.dataset_version
Expand Down
3 changes: 3 additions & 0 deletions mllam_data_prep/ops/cropping.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def _get_latlon_coords(da: xr.DataArray) -> tuple:
elif "lat" in da.coords and "lon" in da.coords:
return (da.lat, da.lon)
else:
import ipdb

ipdb.set_trace()
raise Exception("Could not find lat/lon coordinates in DataArray.")


Expand Down

0 comments on commit 972186d

Please sign in to comment.