Skip to content

Commit

Permalink
Fix nimbus fetching mechanism for ReadTheDocs (#2001)
Browse files Browse the repository at this point in the history
### What kind of change does this PR introduce?

* Overloads the `fetch()` method of the object returned by `nimbus()` to
add a `User-Agent` header, thus preventing requests from ReadTheDocs from
being rejected by GitHub with 403 (Forbidden) errors.
* Fixes up the logic for fetching the `registry.txt` files and testing
data from non-`Ouranosinc/xclim-testdata` repositories that follow the
same conventions (forks, `xhydro-testdata`, etc.).

### Does this PR introduce a breaking change?

Not really. The `fetch` calls have been modified, and the registry files
for non-`Ouranosinc/xclim-testdata` repositories are now saved to the
testing folder using the following naming convention:
`registry.{repo-name}.{branch-name}.txt`.

### Other information:

readthedocs/readthedocs.org#11763
  • Loading branch information
Zeitsperre authored Nov 20, 2024
2 parents 6121ec7 + 2f57a89 commit 13c4842
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 7 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@ Bug fixes
* Fixed pickling issue with ``xclim.sdba.Grouper`` and other classes for usage with `dask>=2024.11`. (:issue:`1992`, :pull:`1993`).
* Fixed an issue with ``nimbus`` that was causing URL path components to be improperly joined. (:pull:`1997`).
* `base_kws_vars` in `MBCn` is now copied inside the `adjust` function so that in-place changes do not change the dict globally. (:pull:`1999`).
* Fixed a bug in the logic of ``xclim.testing.utils.load_registry`` that impacted the ability to load a `registry.txt` from a non-default repository. (:pull:`2001`).

Internal changes
^^^^^^^^^^^^^^^^
* Changed French translations containing the word "pluvieux" to use "avec précipitations". (:issue:`1960`, :pull:`1994`).
* In order to address 403 (forbidden) request errors when retrieving data from GitHub via ReadTheDocs, the ``nimbus`` class has been modified to use an overloaded `fetch` method that appends a User-Agent header to the request. (:pull:`2001`).

v0.53.2 (2024-10-31)
--------------------
Expand Down
59 changes: 52 additions & 7 deletions xclim/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@
import sys
import time
import warnings
from collections.abc import Sequence
from collections.abc import Callable, Sequence
from datetime import datetime as dt
from functools import wraps
from importlib import import_module
from io import StringIO
from pathlib import Path
from shutil import copytree
from typing import TextIO
from typing import IO, TextIO
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin, urlparse
from urllib.request import urlretrieve
Expand Down Expand Up @@ -435,26 +436,38 @@ def load_registry(
dict
Dictionary of filenames and hashes.
"""
if not repo.endswith("/"):
repo = f"{repo}/"
remote_registry = audit_url(
urljoin(
urljoin(repo, branch if branch.endswith("/") else f"{branch}/"),
"data/registry.txt",
)
)

if branch != default_testdata_version:
if repo != default_testdata_repo_url:
external_repo_name = urlparse(repo).path.split("/")[-2]
external_branch_name = branch.split("/")[-1]
registry_file = Path(
str(
ilr.files("xclim").joinpath(
f"testing/registry.{external_repo_name}.{external_branch_name}.txt"
)
)
)
urlretrieve(remote_registry, registry_file) # noqa: S310

elif branch != default_testdata_version:
custom_registry_folder = Path(
str(ilr.files("xclim").joinpath(f"testing/{branch}"))
)
custom_registry_folder.mkdir(parents=True, exist_ok=True)
registry_file = custom_registry_folder.joinpath("registry.txt")
urlretrieve(remote_registry, registry_file) # noqa: S310

elif repo != default_testdata_repo_url:
else:
registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt")))
urlretrieve(remote_registry, registry_file) # noqa: S310

registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt")))
if not registry_file.exists():
raise FileNotFoundError(f"Registry file not found: {registry_file}")

Expand Down Expand Up @@ -516,10 +529,13 @@ def nimbus( # noqa: PR01
"The `pooch` package is required to fetch the xclim testing data. "
"You can install it with `pip install pooch` or `pip install xclim[dev]`."
)
if not repo.endswith("/"):
repo = f"{repo}/"
remote = audit_url(
urljoin(urljoin(repo, branch if branch.endswith("/") else f"{branch}/"), "data")
)
return pooch.create(

_nimbus = pooch.create(
path=cache_dir,
base_url=remote,
version=default_testdata_version,
Expand All @@ -528,6 +544,35 @@ def nimbus( # noqa: PR01
registry=load_registry(branch=branch, repo=repo),
)

# Add a custom fetch method to the Pooch instance
# Needed to address: https://github.com/readthedocs/readthedocs.org/issues/11763
# Fix inspired by @bjlittle (https://github.com/bjlittle/geovista/pull/1202)
_nimbus.fetch_diversion = _nimbus.fetch

# Overload the fetch method to add user-agent headers
@wraps(_nimbus.fetch_diversion)
def _fetch(*args: str, **kwargs: bool | Callable) -> str: # numpydoc ignore=GL08

def _downloader(
url: str,
output_file: str | IO,
poocher: pooch.Pooch,
check_only: bool | None = False,
) -> None:
"""Download the file from the URL and save it to the save_path."""
headers = {"User-Agent": f"xclim ({__xclim_version__})"}
downloader = pooch.HTTPDownloader(headers=headers)
return downloader(url, output_file, poocher, check_only=check_only)

# default to our http/s downloader with user-agent headers
kwargs.setdefault("downloader", _downloader)
return _nimbus.fetch_diversion(*args, **kwargs)

# Replace the fetch method with the custom fetch method
_nimbus.fetch = _fetch

return _nimbus


# idea copied from raven that it borrowed from xclim that borrowed it from xarray that was borrowed from Seaborn
def open_dataset(
Expand Down

0 comments on commit 13c4842

Please sign in to comment.