Merge pull request #60 from coecms/point_workflow
Point workflow
paolap authored Feb 20, 2023
2 parents afdcb22 + 1bdc700 commit 91c0a4d
Showing 5 changed files with 30 additions and 46 deletions.
4 changes: 2 additions & 2 deletions conda/meta.yaml
@@ -1,4 +1,4 @@
{% set version = "1.0.0" %}
{% set version = "0.9.2" %}
package:
name: xmhw
version: {{ version }}
@@ -30,7 +30,7 @@ requirements:

source:
git_url: https://github.com/coecms/xmhw.git
git_rev: 1.0.0
git_rev: 0.9.2

build:
script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed"
20 changes: 3 additions & 17 deletions docs/xmhw_demo.ipynb
@@ -3324,7 +3324,7 @@
"````\n",
" threshold(temp, tdim='time', climatologyPeriod=[None,None], pctile=90, windowHalfWidth=5, \n",
" smoothPercentile=True, smoothPercentileWidth=31, maxPadLength=None, \n",
" coldSpells=False, Ly=False, anynans=False, skipna=False, point=False):\n",
" coldSpells=False, Ly=False, anynans=False, skipna=False):\n",
"````\n",
"\n",
"Where *temp* is the temperature timeseries, this is the only input needed. Arguments names are the same as the original MarineHeatWave code, where possible:<br>\n",
@@ -3368,9 +3368,6 @@
" * **skipna**: bool, optional \n",
" If True percentile and mean function will use skipna=True.\n",
" Using skipna option is much slower (default is False)<br>\n",
" * **point**: bool, optional\n",
" Set True if timeseries is a point, i.e. it has only time dimension,\n",
" it will skip grid parallelisation (default is False)\n",
"\n",
"More on missing values later."
]
@@ -3430,18 +3427,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"If working with a single-point timeseries, there is no need to check for land points or parallelise the workflow, hence from version 1.0 we introduced a new flag `point` to indicate that the timeseries has only a time dimension. Point by default is False."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clim = threshold(sst_single_point, point=True)\n",
"mhw = detect(sst_single_point, clim['thresh'], clim['seas'], point=True)\n",
"block = block_average(mhw, point=True)"
"If working with a single-point timeseries, there is no need to check for land points or parallelise the workflow, hence from version 0.9.2 the code will check for the number of dimensions and when this is equal to one, it assumes it is the time dimension and skip all grid operations, including land_check."
]
},
{
@@ -3477,7 +3463,7 @@
"````\n",
" def detect(temp, th, se, tdim='time', minDuration=5, joinGaps=True, \n",
" maxGap=2, maxPadLength=None, coldSpells=False,\n",
" intermediate=False, anynans=False, point=False):\n",
" intermediate=False, anynans=False):\n",
"\n",
"````\n",
"Apart from the timeseries, the threshold and the seasonal average, the other arguments are all optional.<br>\n",
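For readers following the notebook change: the removed code cell's explicit `point=True` calls are simply dropped. Below is a minimal sketch of the resulting single-point workflow, using a synthetic one-dimensional series as a stand-in for the demo notebook's `sst_single_point`; it assumes the module paths `xmhw.xmhw` and `xmhw.stats` shown in this diff and is a sketch rather than a verified run.

```python
import numpy as np
import pandas as pd
import xarray as xr

from xmhw.xmhw import threshold, detect
from xmhw.stats import block_average

# Synthetic single-point SST series (~12 years of daily values),
# standing in for the demo notebook's sst_single_point
time = pd.date_range("2000-01-01", "2011-12-31", freq="D")
rng = np.random.default_rng(0)
sst = 15 + 5 * np.sin(2 * np.pi * np.asarray(time.dayofyear) / 365.25) + rng.normal(0, 1, time.size)
sst_single_point = xr.DataArray(sst, coords={"time": time}, dims="time", name="sst")

# With a time-only input the point path is taken automatically:
# no land_check, no per-cell parallelisation, and no point=True flag
clim = threshold(sst_single_point)
mhw = detect(sst_single_point, clim["thresh"], clim["seas"])
block = block_average(mhw)
```

Because the input has only a `time` dimension, `threshold`, `detect` and `block_average` all skip the grid operations without any extra arguments.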
4 changes: 2 additions & 2 deletions test/test_stats.py
@@ -77,9 +77,9 @@ def test_check_variables(inter_data):
def test_check_coordinates(inter_data):
# with the introduction of the single-point timeseries skip, this particular test doesn't work as it did before
#inter_stack = land_check(inter_data, tdim="index")
outds, coord = check_coordinates(inter_data, True)
outds, coord = check_coordinates(inter_data)
#xrtest.assert_equal(inter_stack, outds)
assert coord == None
assert coord == 'point'
#stacked = inter_stack.rename({"cell": "other"})
#outds, coord = check_coordinates(stacked)
#xrtest.assert_equal(stacked, outds)
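The updated assertion captures the new contract of `check_coordinates`: a dataset with a single dimension is treated as a point timeseries and the returned stacking coordinate is now the string `'point'` rather than `None`. A small illustrative sketch, with a toy dataset standing in for the `inter_data` fixture (variable and dimension names here are placeholders):

```python
import numpy as np
import pandas as pd
import xarray as xr

from xmhw.stats import check_coordinates

# One-dimensional input: a single time-like 'index' dimension, as in inter_data
time = pd.date_range("2000-01-01", periods=10, freq="D")
ds = xr.Dataset({"ts": ("index", np.arange(10.0))}, coords={"index": time})

outds, coord = check_coordinates(ds)
assert coord == "point"   # single-dimension input skips land_check entirely
```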
30 changes: 13 additions & 17 deletions xmhw/stats.py
@@ -32,7 +32,6 @@ def block_average(
mtime="time_start",
removeMissing=False,
split=False,
point=False,
):
"""Calculate statistics like averages, mean and maximum on blocks of years.
@@ -66,9 +65,6 @@
(default is False)
split: bool, optional
(default is False)
point: bool, optional
Set True if timeseries is a point, i.e. it has only time dimension,
it will skip grid parallelisation (default is False)
Returns
-------
@@ -101,7 +97,9 @@
# is already present, then we do not need clims
if dstime is not None:
dstime, sw_cats, sw_temp = check_variables(dstime)
dstime, stack_coord = check_coordinates(dstime, point)
dstime, stack_coord = check_coordinates(dstime)
if stack_coord is None:
point = True
period = [
dstime.time.dt.year[0].values,
dstime.time.dt.year[-1].values,
@@ -136,15 +134,15 @@
blockls = []
# this defines years array to use to groupby arrays
tgroup = mhw[mtime].dt.year
if point:
if stack_coord == 'point':
blockls.append(call_groupby(mhw, tgroup, bins))
else:
# remove land and stack on cell
mhw = land_check(mhw, tdim="events")
for c in mhw.cell:
blockls.append(call_groupby(mhw.sel(cell=c), tgroup, bins))
results = dask.compute(blockls)
if point:
if stack_coord == 'point':
block = blockls[0]
else:
block = xr.concat(results[0], dim=mhw.cell).unstack("cell")
@@ -162,7 +160,7 @@
#tgroup = dstime.isel({stack_coord: 0}).time.dt.year
tgroup = dstime.time.dt.year
statsls = []
if point:
if stack_coord == 'point':
statsls.append( call_groupby(
dstime.sel({stack_coord: c}), tgroup, bins, mode=mode)
)
@@ -173,7 +171,7 @@
)
statsls.append(cell_stats)
results = dask.compute(statsls)
if point:
if stack_coord == 'point':
tstats = results[0][0]
else:
tstats = xr.concat(results[0], dim=dstime[stack_coord])
@@ -239,17 +237,14 @@ def check_variables(dstime):
return dstime, sw_cats, sw_temp


def check_coordinates(dstime, point):
def check_coordinates(dstime):
"""Check if coordinates are stacked on cell dimension.
Parameters
----------
dstime: xarray Dataset
Based on intermediate dataset returned by detect(), includes
original ts and climatologies (optional) along 'time' dimension
point: bool, optional
Set True if timeseries is a point, i.e. it has only time dimension,
it will skip grid parallelisation
Returns
@@ -263,10 +258,14 @@
# find name of time dimension
# If there is already a stacked dimension skip land_check
check = True
# now that we use a stacked array without crearting an index
# now that we use a stacked array without creating an index
# the stacked coord is a dimension without coordinates
# and its type is int64, no longer object
ds_coords = list(dstime.dims)
# if there is only 1 dimension, assume it is time and set stack_coord to 'point'
if len(ds_coords) == 1:
stack_coord = 'point'
check = False
for x in ds_coords:
dtype = str(dstime[x].dtype)
if dtype == "int64":
@@ -275,9 +274,6 @@
elif "datetime" in dtype:
tdim = x
print(f"Assuming {tdim} is time dimension")
if point:
stack_coord = None
check = False
if check:
dstime = land_check(dstime, tdim=tdim)
stack_coord = "cell"
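In short, the point/grid decision in `stats.py` no longer depends on a caller-supplied flag; it follows from the number of dimensions of the input. The same check, sketched in plain xarray with illustrative names (not xmhw code):

```python
import numpy as np
import xarray as xr

ts_1d = xr.DataArray(np.arange(5.0), dims="time")
grid = xr.DataArray(np.zeros((5, 2, 2)), dims=("time", "lat", "lon"))

# Mirrors check_coordinates(): one dimension -> point timeseries, skip land_check;
# more than one -> gridded input, stack on 'cell' and loop over cells.
for da in (ts_1d, grid):
    stack_coord = "point" if len(da.dims) == 1 else "cell"
    print(da.dims, "->", stack_coord)
```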
18 changes: 10 additions & 8 deletions xmhw/xmhw.py
@@ -48,7 +48,6 @@ def threshold(
tstep = False,
anynans = False,
skipna = False,
point = False,
):
"""Calculate threshold and mean climatology (day-of-year).
@@ -91,9 +90,6 @@
skipna: bool, optional
If True percentile and mean function will use skipna=True.
Using skipna option is much slower (default is False)
point: bool, optional
Set True if timeseries is a point, i.e. it has only time dimension,
it will skip grid parallelisation (default is False)
Returns
-------
@@ -121,6 +117,11 @@
)
}
temp = temp.sel(**tslice)
# Check if there is only one dimension (assumed as time)
# then skip all multidimensional operations
dims = list(temp.dims)
if len(dims) == 1:
point = True
# Save original attributes in dictionary to assign to final dataset
ds_attrs = {}
ds_attrs["ts"] = temp.attrs
@@ -317,7 +318,6 @@ def detect(
intermediate = False,
anynans = False,
tstep = False,
point = False,
):
"""Applies the Hobday et al. (2016) marine heat wave definition to
a temperature timeseries. Returns properties of all detected MHWs.
@@ -357,9 +357,6 @@
If True the timeseries timestep is used as base for 'doy' unit
To use with any but 365/366 days year daily files
(default is False)
point: bool, optional
set True if timeseries is a point, i.e. it has only time dimension,
it will skip grid parallelisation (default is False)
Returns
-------
@@ -376,6 +373,11 @@
"Maximum gap between mhw events should"
+ " be smaller than event minimum duration"
)
# Check if there is only one dimension (assumed as time)
# then skip all multidimensional operations
dims = list(temp.dims)
if len(dims) == 1:
point = True
# if time dimension different from time, rename it
#temp = temp.rename({tdim: "time"})
# save original attributes in a dictionary to assign to final dataset
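One practical consequence of removing the keyword from `threshold()` and `detect()`: scripts that still pass `point=True` now fail up front instead of being accepted, since the signatures no longer list it (assuming there is no catch-all `**kwargs`, which this diff does not show). Continuing from the earlier single-point sketch:

```python
from xmhw.xmhw import threshold

try:
    clim = threshold(sst_single_point, point=True)   # old-style call
except TypeError as err:
    print(err)   # unexpected keyword argument 'point'

# Dropping the flag is the only change needed; the 1-D input is detected instead
clim = threshold(sst_single_point)
```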
