Merge pull request #60 from coecms/point_workflow
Point workflow
paolap authored Feb 20, 2023
2 parents afdcb22 + 1bdc700 commit 91c0a4d
Showing 5 changed files with 30 additions and 46 deletions.
4 changes: 2 additions & 2 deletions conda/meta.yaml
@@ -1,4 +1,4 @@
{% set version = "1.0.0" %}
{% set version = "0.9.2" %}
package:
name: xmhw
version: {{ version }}
@@ -30,7 +30,7 @@ requirements:

source:
git_url: https://github.com/coecms/xmhw.git
git_rev: 1.0.0
git_rev: 0.9.2

build:
script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed"
20 changes: 3 additions & 17 deletions docs/xmhw_demo.ipynb
@@ -3324,7 +3324,7 @@
"````\n",
" threshold(temp, tdim='time', climatologyPeriod=[None,None], pctile=90, windowHalfWidth=5, \n",
" smoothPercentile=True, smoothPercentileWidth=31, maxPadLength=None, \n",
" coldSpells=False, Ly=False, anynans=False, skipna=False, point=False):\n",
" coldSpells=False, Ly=False, anynans=False, skipna=False):\n",
"````\n",
"\n",
"Where *temp* is the temperature timeseries, this is the only input needed. Arguments names are the same as the original MarineHeatWave code, where possible:<br>\n",
@@ -3368,9 +3368,6 @@
" * **skipna**: bool, optional \n",
" If True percentile and mean function will use skipna=True.\n",
" Using skipna option is much slower (default is False)<br>\n",
" * **point**: bool, optional\n",
" Set True if timeseries is a point, i.e. it has only time dimension,\n",
" it will skip grid parallelisation (default is False)\n",
"\n",
"More on missing values later."
]
@@ -3430,18 +3427,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"If working with a single-point timeseries, there is no need to check for land points or parallelise the workflow, hence from version 1.0 we introduced a new flag `point` to indicate that the timeseries has only a time dimension. Point by default is False."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clim = threshold(sst_single_point, point=True)\n",
"mhw = detect(sst_single_point, clim['thresh'], clim['seas'], point=True)\n",
"block = block_average(mhw, point=True)"
"If working with a single-point timeseries, there is no need to check for land points or parallelise the workflow, hence from version 0.9.2 the code will check for the number of dimensions and when this is equal to one, it assumes it is the time dimension and skip all grid operations, including land_check."
]
},
{
@@ -3477,7 +3463,7 @@
"````\n",
" def detect(temp, th, se, tdim='time', minDuration=5, joinGaps=True, \n",
" maxGap=2, maxPadLength=None, coldSpells=False,\n",
" intermediate=False, anynans=False, point=False):\n",
" intermediate=False, anynans=False):\n",
"\n",
"````\n",
"Apart from the timeseries, the threshold and the seasonal average, the other arguments are all optional.<br>\n",
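For readers following the notebook change: the removed code cell's explicit `point=True` calls are simply dropped. Below is a minimal sketch of the resulting single-point workflow, using a synthetic one-dimensional series as a stand-in for the demo notebook's `sst_single_point`; it assumes the module paths `xmhw.xmhw` and `xmhw.stats` shown in this diff and is a sketch rather than a verified run.

```python
import numpy as np
import pandas as pd
import xarray as xr

from xmhw.xmhw import threshold, detect
from xmhw.stats import block_average

# Synthetic single-point SST series (~12 years of daily values),
# standing in for the demo notebook's sst_single_point
time = pd.date_range("2000-01-01", "2011-12-31", freq="D")
rng = np.random.default_rng(0)
sst = 15 + 5 * np.sin(2 * np.pi * np.asarray(time.dayofyear) / 365.25) + rng.normal(0, 1, time.size)
sst_single_point = xr.DataArray(sst, coords={"time": time}, dims="time", name="sst")

# With a time-only input the point path is taken automatically:
# no land_check, no per-cell parallelisation, and no point=True flag
clim = threshold(sst_single_point)
mhw = detect(sst_single_point, clim["thresh"], clim["seas"])
block = block_average(mhw)
```

Because the input has only a `time` dimension, `threshold`, `detect` and `block_average` all skip the grid operations without any extra arguments.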
4 changes: 2 additions & 2 deletions test/test_stats.py
@@ -77,9 +77,9 @@ def test_check_variables(inter_data):
def test_check_coordinates(inter_data):
# with the introduction of the single-point timeseries skip, this particular test doesn't work as it did before
#inter_stack = land_check(inter_data, tdim="index")
outds, coord = check_coordinates(inter_data, True)
outds, coord = check_coordinates(inter_data)
#xrtest.assert_equal(inter_stack, outds)
assert coord == None
assert coord == 'point'
#stacked = inter_stack.rename({"cell": "other"})
#outds, coord = check_coordinates(stacked)
#xrtest.assert_equal(stacked, outds)
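The updated assertion captures the new contract of `check_coordinates`: a dataset with a single dimension is treated as a point timeseries and the returned stacking coordinate is now the string `'point'` rather than `None`. A small illustrative sketch, with a toy dataset standing in for the `inter_data` fixture (variable and dimension names here are placeholders):

```python
import numpy as np
import pandas as pd
import xarray as xr

from xmhw.stats import check_coordinates

# One-dimensional input: a single time-like 'index' dimension, as in inter_data
time = pd.date_range("2000-01-01", periods=10, freq="D")
ds = xr.Dataset({"ts": ("index", np.arange(10.0))}, coords={"index": time})

outds, coord = check_coordinates(ds)
assert coord == "point"   # single-dimension input skips land_check entirely
```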
30 changes: 13 additions & 17 deletions xmhw/stats.py
@@ -32,7 +32,6 @@ def block_average(
mtime="time_start",
removeMissing=False,
split=False,
point=False,
):
"""Calculate statistics like averages, mean and maximum on blocks of years.
@@ -66,9 +65,6 @@
(default is False)
split: bool, optional
(default is False)
point: bool, optional
Set True if timeseries is a point, i.e. it has only time dimension,
it will skip grid parallelisation (default is False)
Returns
-------
@@ -101,7 +97,9 @@
# is already present, then we do not need clims
if dstime is not None:
dstime, sw_cats, sw_temp = check_variables(dstime)
dstime, stack_coord = check_coordinates(dstime, point)
dstime, stack_coord = check_coordinates(dstime)
if stack_coord is None:
point = True
period = [
dstime.time.dt.year[0].values,
dstime.time.dt.year[-1].values,
@@ -136,15 +134,15 @@
blockls = []
# this defines years array to use to groupby arrays
tgroup = mhw[mtime].dt.year
if point:
if stack_coord == 'point':
blockls.append(call_groupby(mhw, tgroup, bins))
else:
# remove land and stack on cell
mhw = land_check(mhw, tdim="events")
for c in mhw.cell:
blockls.append(call_groupby(mhw.sel(cell=c), tgroup, bins))
results = dask.compute(blockls)
if point:
if stack_coord == 'point':
block = blockls[0]
else:
block = xr.concat(results[0], dim=mhw.cell).unstack("cell")
@@ -162,7 +160,7 @@
#tgroup = dstime.isel({stack_coord: 0}).time.dt.year
tgroup = dstime.time.dt.year
statsls = []
if point:
if stack_coord == 'point':
statsls.append( call_groupby(
dstime.sel({stack_coord: c}), tgroup, bins, mode=mode)
)
@@ -173,7 +171,7 @@
)
statsls.append(cell_stats)
results = dask.compute(statsls)
if point:
if stack_coord == 'point':
tstats = results[0][0]
else:
tstats = xr.concat(results[0], dim=dstime[stack_coord])
@@ -239,17 +237,14 @@ def check_variables(dstime):
return dstime, sw_cats, sw_temp


def check_coordinates(dstime, point):
def check_coordinates(dstime):
"""Check if coordinates are stacked on cell dimension.
Parameters
----------
dstime: xarray Dataset
Based on intermediate dataset returned by detect(), includes
original ts and climatologies (optional) along 'time' dimension
point: bool, optional
Set True if timeseries is a point, i.e. it has only time dimension,
it will skip grid parallelisation
Returns
@@ -263,10 +258,14 @@
# find name of time dimension
# If there is already a stacked dimension skip land_check
check = True
# now that we use a stacked array without crearting an index
# now that we use a stacked array without creating an index
# the stacked coord is a dimension without coordinates
# and its type is int64, no longer object
ds_coords = list(dstime.dims)
# if there is only 1 dimension, assume it is time and set stack_coord to 'point'
if len(ds_coords) == 1:
stack_coord = 'point'
check = False
for x in ds_coords:
dtype = str(dstime[x].dtype)
if dtype == "int64":
@@ -275,9 +274,6 @@
elif "datetime" in dtype:
tdim = x
print(f"Assuming {tdim} is time dimension")
if point:
stack_coord = None
check = False
if check:
dstime = land_check(dstime, tdim=tdim)
stack_coord = "cell"
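In short, the point/grid decision in `stats.py` no longer depends on a caller-supplied flag; it follows from the number of dimensions of the input. The same check, sketched in plain xarray with illustrative names (not xmhw code):

```python
import numpy as np
import xarray as xr

ts_1d = xr.DataArray(np.arange(5.0), dims="time")
grid = xr.DataArray(np.zeros((5, 2, 2)), dims=("time", "lat", "lon"))

# Mirrors check_coordinates(): one dimension -> point timeseries, skip land_check;
# more than one -> gridded input, stack on 'cell' and loop over cells.
for da in (ts_1d, grid):
    stack_coord = "point" if len(da.dims) == 1 else "cell"
    print(da.dims, "->", stack_coord)
```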
18 changes: 10 additions & 8 deletions xmhw/xmhw.py
@@ -48,7 +48,6 @@ def threshold(
tstep = False,
anynans = False,
skipna = False,
point = False,
):
"""Calculate threshold and mean climatology (day-of-year).
@@ -91,9 +90,6 @@
skipna: bool, optional
If True percentile and mean function will use skipna=True.
Using skipna option is much slower (default is False)
point: bool, optional
Set True if timeseries is a point, i.e. it has only time dimension,
it will skip grid parallelisation (default is False)
Returns
-------
@@ -121,6 +117,11 @@
)
}
temp = temp.sel(**tslice)
# Check if there is only one dimension (assumed as time)
# then skip all multidimensional operations
dims = list(temp.dims)
if len(dims) == 1:
point = True
# Save original attributes in dictionary to assign to final dataset
ds_attrs = {}
ds_attrs["ts"] = temp.attrs
@@ -317,7 +318,6 @@ def detect(
intermediate = False,
anynans = False,
tstep = False,
point = False,
):
"""Applies the Hobday et al. (2016) marine heat wave definition to
a temperature timeseries. Returns properties of all detected MHWs.
@@ -357,9 +357,6 @@
If True the timeseries timestep is used as base for 'doy' unit
To use with any but 365/366 days year daily files
(default is False)
point: bool, optional
set True if timeseries is a point, i.e. it has only time dimension,
it will skip grid parallelisation (default is False)
Returns
-------
@@ -376,6 +373,11 @@
"Maximum gap between mhw events should"
+ " be smaller than event minimum duration"
)
# Check if there is only one dimension (assumed as time)
# then skip all multidimensional operations
dims = list(temp.dims)
if len(dims) == 1:
point = True
# if time dimension different from time, rename it
#temp = temp.rename({tdim: "time"})
# save original attributes in a dictionary to assign to final dataset
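One practical consequence of removing the keyword from `threshold()` and `detect()`: scripts that still pass `point=True` now fail up front instead of being accepted, since the signatures no longer list it (assuming there is no catch-all `**kwargs`, which this diff does not show). Continuing from the earlier single-point sketch:

```python
from xmhw.xmhw import threshold

try:
    clim = threshold(sst_single_point, point=True)   # old-style call
except TypeError as err:
    print(err)   # unexpected keyword argument 'point'

# Dropping the flag is the only change needed; the 1-D input is detected instead
clim = threshold(sst_single_point)
```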
