From 4d2cc3bf2845ce08e68b0f3206bcb0edf0659985 Mon Sep 17 00:00:00 2001 From: Philip Chmielowiec <67855069+philipc2@users.noreply.github.com> Date: Thu, 12 Dec 2024 04:33:30 -0600 Subject: [PATCH] Fix bug in `_map_dims_to_ugrid`, use Polars to improve SCRIP reader performance (#1109) * remove check for n_edge which was constructing connectivity * use polars for unique calls in SCRIP reader --- ci/asv.yml | 1 + ci/docs.yml | 1 + ci/environment.yml | 1 + pyproject.toml | 1 + uxarray/core/utils.py | 13 ++++++++---- uxarray/io/_scrip.py | 46 +++++++++++++++++++++++-------------------- 6 files changed, 38 insertions(+), 25 deletions(-) diff --git a/ci/asv.yml b/ci/asv.yml index cbe442b33..2c4089646 100644 --- a/ci/asv.yml +++ b/ci/asv.yml @@ -17,6 +17,7 @@ dependencies: - pandas - pathlib - pre_commit + - polars - pyarrow - pytest - pytest-cov diff --git a/ci/docs.yml b/ci/docs.yml index a7411af73..6fcac9726 100644 --- a/ci/docs.yml +++ b/ci/docs.yml @@ -34,6 +34,7 @@ dependencies: - pandas - geocat-datafiles - spatialpandas + - polars - geopandas - pip: - antimeridian diff --git a/ci/environment.yml b/ci/environment.yml index a696e397a..32cc0c8e9 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -21,6 +21,7 @@ dependencies: - pandas - pathlib - pre_commit + - polars - pyarrow - pytest - pip diff --git a/pyproject.toml b/pyproject.toml index 169689fcd..6a0ed0b37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ dependencies = [ "geopandas", "xarray", "hvplot", + "polars", ] # minimal dependencies end diff --git a/uxarray/core/utils.py b/uxarray/core/utils.py index 7b2292308..fdd38c5fa 100644 --- a/uxarray/core/utils.py +++ b/uxarray/core/utils.py @@ -39,16 +39,21 @@ def _map_dims_to_ugrid( # drop dimensions not present in the original dataset _source_dims_dict.pop(key) + # only check edge dimension if it is present (to avoid overhead of computing connectivity) + if "n_edge" in grid._ds.dims: + n_edge = grid._ds.sizes["n_edge"] + else: + n_edge = None + for dim in set(ds.dims) ^ _source_dims_dict.keys(): # obtain dimensions that were not parsed source_dims_dict and attempt to match to a grid element if ds.sizes[dim] == grid.n_face: _source_dims_dict[dim] = "n_face" elif ds.sizes[dim] == grid.n_node: _source_dims_dict[dim] = "n_node" - elif ds.sizes[dim] == grid.n_edge: - _source_dims_dict[dim] = "n_edge" - - # Possible Issue: https://github.com/UXARRAY/uxarray/issues/610 + elif n_edge is not None: + if ds.sizes[dim] == n_edge: + _source_dims_dict[dim] = "n_edge" # rename dimensions to follow the UGRID conventions ds = ds.swap_dims(_source_dims_dict) diff --git a/uxarray/io/_scrip.py b/uxarray/io/_scrip.py index c07824147..f6b874e93 100644 --- a/uxarray/io/_scrip.py +++ b/uxarray/io/_scrip.py @@ -1,6 +1,8 @@ import xarray as xr import numpy as np +import polars as pl + from uxarray.grid.connectivity import _replace_fill_values from uxarray.constants import INT_DTYPE, INT_FILL_VALUE @@ -11,43 +13,45 @@ def _to_ugrid(in_ds, out_ds): """If input dataset (``in_ds``) file is an unstructured SCRIP file, function will reassign SCRIP variables to UGRID conventions in output file (``out_ds``). - - Parameters - ---------- - in_ds : xarray.Dataset - Original scrip dataset of interest being used - - out_ds : xarray.Variable - file to be returned by ``_populate_scrip_data``, used as an empty placeholder file - to store reassigned SCRIP variables in UGRID conventions """ source_dims_dict = {} if in_ds["grid_area"].all(): # Create node_lon & node_lat variables from grid_corner_lat/lon - # Turn latitude scrip array into 1D instead of 2D + # Turn latitude and longitude scrip arrays into 1D corner_lat = in_ds["grid_corner_lat"].values.ravel() - - # Repeat above steps with longitude data instead corner_lon = in_ds["grid_corner_lon"].values.ravel() - # Combine flat lat and lon arrays - corner_lon_lat = np.vstack((corner_lon, corner_lat)).T + # Use Polars to find unique coordinate pairs + df = pl.DataFrame({"lon": corner_lon, "lat": corner_lat}).with_row_count( + "original_index" + ) + + # Get unique rows (first occurrence). This preserves the order in which they appear. + unique_df = df.unique(subset=["lon", "lat"], keep="first") + + # unq_ind: The indices of the unique rows in the original array + unq_ind = unique_df["original_index"].to_numpy().astype(INT_DTYPE) + + # To get the inverse index (unq_inv): map each original row back to its unique row index. + # Add a unique_id to the unique_df which will serve as the "inverse" mapping. + unique_df = unique_df.with_row_count("unique_id") - # Run numpy unique to determine which rows/values are actually unique - _, unq_ind, unq_inv = np.unique( - corner_lon_lat, return_index=True, return_inverse=True, axis=0 + # Join original df with unique_df to find out which unique_id corresponds to each row + df_joined = df.join( + unique_df.drop("original_index"), on=["lon", "lat"], how="left" ) + unq_inv = df_joined["unique_id"].to_numpy().astype(INT_DTYPE) - # Now, calculate unique lon and lat values to account for 'node_lon' and 'node_lat' - unq_lon = corner_lon_lat[unq_ind, :][:, 0] - unq_lat = corner_lon_lat[unq_ind, :][:, 1] + # Extract unique lon and lat values using unq_ind + unq_lon = corner_lon[unq_ind] + unq_lat = corner_lat[unq_ind] # Reshape face nodes array into original shape for use in 'face_node_connectivity' unq_inv = np.reshape(unq_inv, (len(in_ds.grid_size), len(in_ds.grid_corners))) - # Create node_lon & node_lat from unsorted, unique grid_corner_lat/lon + # Create node_lon & node_lat out_ds[ugrid.NODE_COORDINATES[0]] = xr.DataArray( unq_lon, dims=[ugrid.NODE_DIM], attrs=ugrid.NODE_LON_ATTRS )