diff --git a/docs/conf.py b/docs/conf.py index bacd2eb..30b10df 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -57,6 +57,8 @@ "sphinx.ext.todo", "sphinx_rtd_theme", "nbsphinx", + "sphinx.ext.graphviz", + "sphinx.ext.intersphinx", ] # Add any paths that contain templates here, relative to this directory. @@ -67,6 +69,10 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "'**.ipynb_checkpoints'"] +intersphinx_mapping = { + "python": ("https://python.readthedocs.io/en/latest", None), +} + # https://www.notion.so/Deepnote-Launch-Buttons-63c642a5e875463495ed2341e83a4b2a diff --git a/docs/index.rst b/docs/index.rst index 57cf9e3..8c093f5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -42,6 +42,13 @@ datasets `. **Using training datasets** - :doc:`using/introduction` +- :doc:`using/opening` +- :doc:`using/subsetting` +- :doc:`using/combining` +- :doc:`using/selecting` +- :doc:`using/grids` +- :doc:`using/statistics` +- :doc:`using/other` - :doc:`using/options` .. toctree:: @@ -50,6 +57,13 @@ datasets `. :caption: Using datasets using/introduction + using/opening + using/subsetting + using/combining + using/selecting + using/grids + using/statistics + using/other using/options **Building training datasets** diff --git a/docs/using/combining.rst b/docs/using/combining.rst new file mode 100644 index 0000000..ed3d0ed --- /dev/null +++ b/docs/using/combining.rst @@ -0,0 +1,81 @@ +.. _combining-datasets: + +#################### + Combining datasets +#################### + +When combining datasets, the statistics of the first dataset are used by +default. You can change this by setting the :ref:`selecting-statistics` +option to a different dataset, even if it is not part of the +combination. See + +.. _concat: + +******** + concat +******** + +You can concatenate two or more datasets along the dates dimension. 
The +package will check that all datasets are compatible (same resolution, +same variables, etc.). Currently, the datasets must be given in +chronological order with no gaps between them. + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1940-1978-1h-v2", + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + ) + +.. image:: concat.png + :alt: Concatenation + +Please note that you can pass more than two ``zarr`` files to the +function. + + **NOTE:** When concatenating files, the statistics are not recomputed; + it is the statistics of the first file that are returned to the user. + +****** + join +****** + +You can join two datasets that have the same dates, combining their +variables. + +.. code:: python + + from ecml_tools.data import open_dataset + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + "some-extra-parameters-from-another-source-o96-1979-2022-1h-v2", + ) + +.. image:: join.png + :alt: Join + +If a variable is present in more than one file, the last occurrence of +that variable will be used, and will be at the position of the first +occurrence of that name. + +.. image:: overlay.png + :alt: Overlay + +Please note that you can join more than two ``zarr`` files. + +*********** + ensembles +*********** + +.. code:: python + + open_dataset(ensembles=[dataset1, dataset2, ...]) + +******* + grids +******* + +.. code:: python + + open_dataset(grids=[dataset1, dataset2, ...], method=...) diff --git a/docs/using/grids.rst b/docs/using/grids.rst new file mode 100644 index 0000000..534320f --- /dev/null +++ b/docs/using/grids.rst @@ -0,0 +1,17 @@ +.. _selecting-grids: + +####################### + Selecting grid points +####################### + +********** + thinning +********** + +.. 
code:: python + + open_dataset(dataset, thinning=..., method="every-nth") + +****** + area +****** diff --git a/docs/using/opening.rst b/docs/using/opening.rst new file mode 100644 index 0000000..9934728 --- /dev/null +++ b/docs/using/opening.rst @@ -0,0 +1,52 @@ +.. _opening-datasets: + +################## + Opening datasets +################## + +.. code:: python + + from anemoi_datasets import open_dataset + + ds = open_dataset("path/to/dataset.zarr", option1=value1, option2=value2, ...) + +or + +.. code:: python + + from anemoi_datasets import open_dataset + + ds = open_dataset(combine=["path/to/dataset1.zarr", + "path/to/dataset2.zarr", ...]) + +or + +.. code:: python + + from anemoi_datasets import open_dataset + + ds = open_dataset(combine=["path/to/dataset1.zarr", + "path/to/dataset2.zarr", ...], + option1=value1, option2=value2, ...) + +The term ``combine`` is one of ``join``, ``concat``, ``ensembles``, etc. See +:ref:`combining-datasets` for more information. + +.. note:: + + The options ``option1``, ``option2``, apply to the combined dataset. + +.. code:: python + + from anemoi_datasets import open_dataset + + ds = open_dataset(combine=[{"dataset": "path/to/dataset1.zarr", + "option1": value1, "option2": value2, ...}, + {"dataset": "path/to/dataset2.zarr", + "option3": value3, "option4": value4, ...}, + ...]) + +.. note:: + + The options ``option1``, ``option2``, apply to the first dataset, and + ``option3``, ``option4``, to the second dataset, etc. diff --git a/docs/using/options.rst b/docs/using/options.rst index 7659de8..e5953ea 100644 --- a/docs/using/options.rst +++ b/docs/using/options.rst @@ -71,7 +71,8 @@ dataset ======= This is a path or URL to a ``zarr`` file that has been created with this -package, as described in :ref:`datasets-building`. +package, as described in :ref:`Building training datasets +`. .. 
code:: python @@ -94,199 +95,3 @@ Alternatively, you can pass an already opened dataset: ds1 = open_dataset("aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2") ds2 = open_dataset(ds1, start=1979, end=2020) - -start -===== - -This option let you subset the dataset by time. You can pass a date or a - -.. code:: python - - open_dataset(dataset, start=1980) - -end -=== - -As for the start option, you can pass a date or a string: - -.. code:: python - - open_dataset(dataset, end="2020-12-31") - -The following are equivalent way of describing ``start`` or ``end``: - -- ``2020`` and ``"2020"`` -- ``202306``, ``"202306"`` and ``"2023-06"`` -- ``20200301``, ``"20200301"`` and ``"2020-03-01"`` - -frequency -========= - -You can change the frequency of the dataset by passing a string with the - -.. code:: python - - ds = open_dataset("aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", frequency="6h") - -select -====== - -.. code:: python - - # Select '2t' and 'tp' in that order - - ds = open_dataset( - "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", - select=["2t", "tp"], - ) - -.. code:: python - - # Select '2t' and 'tp', but preserve the order in which they are in the dataset - - ds = open_dataset( - "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", - select={"2t", "tp"}, - ) - -drop -==== - -You can also drop some variables: - -.. code:: python - - ds = open_dataset( - "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", - drop=["10u", "10v"], - ) - -reorder -======= - -and reorder them: - -... using a list - -.. code:: python - - ds = open_dataset( - "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", - reorder=["2t", "msl", "sp", "10u", "10v"], - ) - -... or using a dictionary - -.. code:: python - - ds = open_dataset( - "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", - reorder={"2t": 0, "msl": 1, "sp": 2, "10u": 3, "10v": 4}, - ) - -rename -====== - -You can also rename variables: - -.. 
code:: python - - ds = open_dataset( - "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", - rename={"2t": "t2m"}, - ) - -This will be useful when your join datasets and do not want variables -from one dataset to override the ones from the other. - -.. _statistics: - -statistics -========== - -.. code:: python - - open_dataset(dataset, statistics=other_dataset) - -thinning -======== - -.. code:: python - - open_dataset(dataset, thinning=..., method="every-nth") - -area -==== - -******************** - Combining datasets -******************** - -When combining datasets, the statistics of the first dataset are used by -default. You can change this by setting the :ref:`statistics` option to -a different dataset, even if it is not part of the combination. See - -concat -====== - -You can concatenate two or more datasets along the dates dimension. The -package will check that all datasets are compatible (same resolution, -same variables, etc.). Currently, the datasets must be given in -chronological order with no gaps between them. - -.. code:: python - - ds = open_dataset( - "aifs-ea-an-oper-0001-mars-o96-1940-1978-1h-v2", - "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", - ) - -.. image:: concat.png - :alt: Concatenation - -Please note that you can pass more than two ``zarr`` files to the -function. - - **NOTE:** When concatenating file, the statistics are not recomputed; - it is the statistics of first file that are returned to the user. - -join -==== - -You can join two datasets that have the same dates, combining their -variables. - -.. code:: python - - from ecml_tools.data import open_dataset - - ds = open_dataset( - "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", - "some-extra-parameters-from-another-source-o96-1979-2022-1h-v2", - ) - -.. image:: join.png - :alt: Join - -If a variable is present in more that one file, that last occurrence of -that variable will be used, and will be at the position of the first -occurrence of that name. - -.. 
image:: overlay.png - :alt: Overlay - -Please note that you can join more than two ``zarr`` files. - -ensembles -========= - -.. code:: python - - open_dataset(ensembles=[dataset1, dataset2, ...]) - -grids -===== - -.. code:: python - - open_dataset(grids=[dataset1, dataset2, ...], method=...) diff --git a/docs/using/other.rst b/docs/using/other.rst new file mode 100644 index 0000000..3f45607 --- /dev/null +++ b/docs/using/other.rst @@ -0,0 +1,53 @@ +.. _selecting-other: + +################## + Other operations +################## + +.. warning:: The operations described in this section do not check that their inputs are compatible. + + +***** + zip +***** + +The ``zip`` operation is used to combine multiple datasets into a single dataset. + +.. code:: python + + ds = open_dataset(zip=[dataset1, dataset2, ...]) + + # This will return tuples + + print(ds[0]) + + print(ds[3, 4]) + + + +This operation is identical to Python's :py:func:`zip` function. + +******* + chain +******* + +.. code:: python + + ds = open_dataset(chain=[dataset1, dataset2, ...]) + + +The ``chain`` operation is used to combine multiple datasets into a single dataset. +The datasets are combined by concatenating the data arrays along the first dimension (dates). +This is similar to the :ref:`concat` operation, but no checks are done to see if the datasets are compatible; +this means that the shape of the arrays returned when iterating or indexing may be different. + +This operation is identical to Python's :py:func:`itertools.chain` function. + + +******** + shuffle +******** + +.. code:: python + + ds = open_dataset(dataset, shuffle=True) diff --git a/docs/using/selecting.rst b/docs/using/selecting.rst new file mode 100644 index 0000000..509b45d --- /dev/null +++ b/docs/using/selecting.rst @@ -0,0 +1,80 @@ +.. _selecting-variables: + +##################### + Selecting variables +##################### + +******** + select +******** + +.. 
code:: python + + # Select '2t' and 'tp' in that order + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + select=["2t", "tp"], + ) + +.. code:: python + + # Select '2t' and 'tp', but preserve the order in which they are in the dataset + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + select={"2t", "tp"}, + ) + +****** + drop +****** + +You can also drop some variables: + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + drop=["10u", "10v"], + ) + +********* + reorder +********* + +You can also reorder the variables: + +... using a list + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + reorder=["2t", "msl", "sp", "10u", "10v"], + ) + +... or using a dictionary + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + reorder={"2t": 0, "msl": 1, "sp": 2, "10u": 3, "10v": 4}, + ) + +******** + rename +******** + +You can also rename variables: + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + rename={"2t": "t2m"}, + ) + +This will be useful when you join datasets and do not want variables +from one dataset to override the ones from the other. diff --git a/docs/using/statistics.rst b/docs/using/statistics.rst new file mode 100644 index 0000000..e59f117 --- /dev/null +++ b/docs/using/statistics.rst @@ -0,0 +1,13 @@ +.. _selecting-statistics: + +############ + Statistics +############ + +************ + statistics +************ + +.. code:: python + + open_dataset(dataset, statistics=other_dataset) diff --git a/docs/using/subsetting.rst b/docs/using/subsetting.rst new file mode 100644 index 0000000..137ab24 --- /dev/null +++ b/docs/using/subsetting.rst @@ -0,0 +1,41 @@ +.. _subsetting-datasets: + +##################### + Subsetting datasets +##################### + +******* + start +******* + +This option lets you subset the dataset by time. 
You can pass a date or a string: + +.. code:: python + + open_dataset(dataset, start=1980) + +***** + end +***** + +As for the ``start`` option, you can pass a date or a string: + +.. code:: python + + open_dataset(dataset, end="2020-12-31") + +The following are equivalent ways of describing ``start`` or ``end``: + +- ``2020`` and ``"2020"`` +- ``202306``, ``"202306"`` and ``"2023-06"`` +- ``20200301``, ``"20200301"`` and ``"2020-03-01"`` + +*********** + frequency +*********** + +You can change the frequency of the dataset by passing a string with the desired frequency: + +.. code:: python + + ds = open_dataset("aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", frequency="6h")