Skip to content

Commit

Permalink
Feature/anonymize (#9)
Browse files Browse the repository at this point in the history
* add anonymize

* add anonymize

* rename to sanetise

* rename to sanetise

* annotations for python 3.9

---------

Co-authored-by: Florian Pinault <Florian.Pinault@ecmwf.int>
  • Loading branch information
b8raoult and floriankrb authored Sep 30, 2024
1 parent 789714f commit 14531b9
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Keep it human-readable, your future self will thank you!
- Docsig precommit hooks
- Changelog merge strategy
- Codeowners file
- Create dependency on wcwidth. MIT licence.
- Add sanetise() function to anonymize paths and URLs.

### Changed
- downstream-ci should only run for changes in src and tests
Expand Down
115 changes: 115 additions & 0 deletions src/anemoi/utils/sanetise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.


import os
import re
from pathlib import Path
from urllib.parse import parse_qs
from urllib.parse import urlencode
from urllib.parse import urlparse
from urllib.parse import urlunparse

# Patterns used by earthkit-data for url-patterns and path-patterns

# Matches a brace-delimited placeholder at the start of a string,
# e.g. "{date:strftime(%Y)}".
RE1 = re.compile(r"{([^}]*)}")
# Matches a parenthesised group at the start of a string.
# NOTE(review): the inner character class excludes "}" rather than ")";
# presumably copied as-is from earthkit-data -- confirm before changing.
RE2 = re.compile(r"\(([^}]*)\)")


def sanetise(obj):
    """Return ``obj`` with sensitive information removed from its strings.

    Dictionaries (both keys and values), lists and tuples are walked
    recursively. Every string found is cleaned:

    - full paths are replaced with shortened versions;
    - URL passwords are replaced with '***'.

    Objects of any other type are returned unchanged.
    """
    if isinstance(obj, str):
        return _sanetise_string(obj)

    if isinstance(obj, tuple):
        return tuple(map(sanetise, obj))

    if isinstance(obj, list):
        return list(map(sanetise, obj))

    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            # Clean the key before the value, as a dict display would.
            clean_key = sanetise(key)
            cleaned[clean_key] = sanetise(value)
        return cleaned

    return obj


def _sanetise_string(obj):

parsed = urlparse(obj, allow_fragments=True)

if parsed.scheme:
return _sanetise_url(parsed)

if obj.startswith("/") or obj.startswith("~"):
return _sanetise_path(obj)

return obj


def _sanetise_url(parsed):

LIST = [
"pass",
"password",
"token",
"user",
"key",
"pwd",
"_key",
"_token",
"apikey",
"api_key",
"api_token",
"_api_token",
"_api_key",
"username",
"login",
]

scheme, netloc, path, params, query, fragment = parsed

if parsed.password or parsed.username:
_, host = netloc.split("@")
user = "user:***" if parsed.password else "user"
netloc = f"{user}@{host}"

if query:
qs = parse_qs(query)
for k in LIST:
if k in qs:
qs[k] = "hidden"
query = urlencode(qs, doseq=True)

if params:
qs = parse_qs(params)
for k in LIST:
if k in qs:
qs[k] = "hidden"
params = urlencode(qs, doseq=True)

return urlunparse([scheme, netloc, path, params, query, fragment])


def _sanetise_path(path):
bits = list(reversed(Path(path).parts))
result = [bits.pop(0)]
for bit in bits:
if RE1.match(bit) or RE2.match(bit):
result.append(bit)
continue
if result[-1] == "...":
continue
result.append("...")
result = os.path.join(*reversed(result))
if bits[-1] == "/":
result = os.path.join("/", result)

return result
10 changes: 10 additions & 0 deletions src/anemoi/utils/sanetize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

# Expose ``sanetise`` under the alternative spelling ``sanetize``.
from .sanetise import sanetise as sanetize

# Only the alias is part of this module's public API.
__all__ = ["sanetize"]
69 changes: 69 additions & 0 deletions tests/test_sanetise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.


from anemoi.utils.sanetise import sanetise


def test_sanetise_urls():
    """Check that URL credentials and sensitive parameters are masked."""
    # Credentials embedded in the network location.
    assert sanetise("http://johndoe:password@host:port/path") == "http://user:***@host:port/path"

    base = "http://www.example.com/path"
    sensitive_names = (
        "pass",
        "password",
        "token",
        "user",
        "key",
        "pwd",
        "_key",
        "_token",
        "apikey",
        "api_key",
        "api_token",
        "_api_token",
        "_api_key",
        "username",
        "login",
    )

    # "?" introduces the query string, ";" introduces path parameters;
    # sensitive values must be hidden in both places.
    for separator in ("?", ";"):
        for name in sensitive_names:
            given = f"{base}{separator}{name}=secret"
            wanted = f"{base}{separator}{name}=hidden"
            assert sanetise(given) == wanted


def test_sanetise_paths():
    """Check that absolute paths are shortened and relative ones are kept."""
    # earthkit-data's url and path patterns must survive the shortening.
    expectations = [
        ("/home/johndoe/.ssh/id_rsa", "/.../id_rsa"),
        (
            "/data/model/{date:strftime(%Y)}/{date:strftime(%m)}/{date:strftime(%d)}/analysis.grib",
            "/.../{date:strftime(%Y)}/{date:strftime(%m)}/{date:strftime(%d)}/analysis.grib",
        ),
        # Relative paths are left untouched.
        ("test.grib", "test.grib"),
        ("../test.grib", "../test.grib"),
        ("./test.grib", "./test.grib"),
        ("sub/folder/test.grib", "sub/folder/test.grib"),
        ("./folder/test.grib", "./folder/test.grib"),
    ]

    for given, wanted in expectations:
        assert sanetise(given) == wanted


if __name__ == "__main__":
    # Minimal runner: call every module-level callable named "test_*".
    # A snapshot of globals() is iterated because the loop variables are
    # themselves module globals and are created while the loop runs.
    for name, obj in tuple(globals().items()):
        if not name.startswith("test_"):
            continue
        if not callable(obj):
            continue
        print(f"Running {name}...")
        obj()

0 comments on commit 14531b9

Please sign in to comment.