Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement relative altitude filter #42

Merged
merged 35 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
0f2427b
Boilerplate
thorbjoernl Sep 4, 2024
461f1fc
Progress on implementing relative altitude filter
thorbjoernl Sep 4, 2024
da22b65
fix dependencies
thorbjoernl Sep 4, 2024
0c02e39
Add tests and test topography data
thorbjoernl Sep 5, 2024
41efda6
.
thorbjoernl Sep 5, 2024
8d1379f
.
thorbjoernl Sep 5, 2024
155cf38
CR1
thorbjoernl Sep 5, 2024
45fc98e
CR2
thorbjoernl Sep 5, 2024
cdef4c1
Remove netcdf as explicit dependency
thorbjoernl Sep 5, 2024
3cc7d6e
WIP properly read coords and determine type
thorbjoernl Sep 5, 2024
b77ab63
.
thorbjoernl Sep 5, 2024
c55c3ad
.
thorbjoernl Sep 5, 2024
72cc4a3
.
thorbjoernl Sep 5, 2024
926a6f0
Add cfunits as dependency
thorbjoernl Sep 5, 2024
d1627da
Install libudunits2-dev in ci
thorbjoernl Sep 5, 2024
3b45cb8
.
thorbjoernl Sep 5, 2024
83ca967
.
thorbjoernl Sep 5, 2024
dc3c47f
.
thorbjoernl Sep 5, 2024
9a459d9
Add second test data file
thorbjoernl Sep 5, 2024
709e9bf
.
thorbjoernl Sep 5, 2024
405d62c
Tests green locally
thorbjoernl Sep 5, 2024
fb86b1e
Numpy 2 breaks fix
thorbjoernl Sep 5, 2024
da9b97f
Remove code comments + model->gridded
thorbjoernl Sep 5, 2024
102a5c5
.
thorbjoernl Sep 5, 2024
0e7a6f2
.
thorbjoernl Sep 5, 2024
b79c5c2
Make dependencies optional
thorbjoernl Sep 6, 2024
6717392
Oopsie
thorbjoernl Sep 6, 2024
e557496
.
thorbjoernl Sep 6, 2024
f79683c
.
thorbjoernl Sep 6, 2024
0b5d8ed
.
thorbjoernl Sep 6, 2024
cd1dfc9
...
thorbjoernl Sep 6, 2024
b3742f4
Exclude stations outside grid bounding bocks
thorbjoernl Sep 6, 2024
600865a
Add benchmark
thorbjoernl Sep 6, 2024
82df240
Optimized code: benchmark from 27.133s->0.067s
thorbjoernl Sep 6, 2024
a69a48d
Preallocate numpy arrays
thorbjoernl Sep 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/ci_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install system packages
run: |
sudo apt update
sudo apt install libudunits2-dev
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
50 changes: 50 additions & 0 deletions scripts/benchmark_relalt_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pyaro
import csv
import time
import random
import os

# Generate test-data
if not os.path.exists("tmp_data.csv"):
with open("tmp_data.csv", "w", newline='') as csvfile:
writer = csv.writer(csvfile)
variable = "NOx"
unit = "Gg"
for i in range(75000):
name = f"station{i}"
lat = random.uniform(30.05, 81.95)
lon = random.uniform(-29.95, 89.95)
value = random.uniform(0, 1)
altitude = random.randrange(0, 1000)
start_time = "1997-01-01 00:00:00"
end_time = "1997-01-02 00:00:00"


writer.writerow((variable, name, lon, lat, value, unit, start_time, end_time, altitude))

# Benchmark
engines = pyaro.list_timeseries_engines()
with engines["csv_timeseries"].open(
filename="tmp_data.csv",
#filters=[pyaro.timeseries.filters.get("altitude", min_altitude=200)], # 0.023s
filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "../tests/testdata/datadir_elevation/topography.nc", rdiff=90)], # 27.133s
columns={
"variable": 0,
"station": 1,
"longitude": 2,
"latitude": 3,
"value": 4,
"units": 5,
"start_time": 6,
"end_time": 7,
"altitude": 8,
"country": "NO",
"standard_deviation": "NaN",
"flag": "0",
}) as ts:
start_time = time.perf_counter()
ts.stations()
end_time = time.perf_counter()


print(f"Total time: {end_time-start_time:.3f} seconds")
14 changes: 14 additions & 0 deletions scripts/make_topography_subset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import xarray as xr

# Script which extracts only the first time point and topography value from the emep topography
# file to reduce data for the tests.
data = xr.open_dataset("/lustre/storeB/project/fou/kl/emep/Auxiliary/topography.nc")

start_time = data["time"][0]

data = data.sel(time=slice(start_time))

data = data["topography"]


data.to_netcdf("tests/testdata/datadir_elevation/topography.nc")
15 changes: 11 additions & 4 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,31 @@ url = https://pyaro.readthedocs.io
[options]
python_version = >=3.10
install_requires =
numpy >= 1.13
numpy >= 1.13, <2.0.0
thorbjoernl marked this conversation as resolved.
Show resolved Hide resolved
importlib-metadata >= 3.6; python_version < "3.10"
package_dir =
=src
packages = pyaro, pyaro.timeseries, pyaro.csvreader
test_require = tox:tox
test_require =
tox:tox

[options.extras_require]
optional =
pandas
cf-units
xarray
netcdf4

[tox:tox]
min_version = 4.0
env_list =
py312
py311
py310
depends =
pandas

[testenv]
commands = python3 -m unittest discover -s tests
extras = optional


[options.entry_points]
Expand Down
2 changes: 1 addition & 1 deletion src/pyaro/csvreader/CSVTimeseriesReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def col_keys(cls):
"""
return cls._col_keys

def metadata(self) -> dict():
def metadata(self) -> dict:
return self._metadata

def _unfiltered_data(self, varname) -> Data:
Expand Down
2 changes: 1 addition & 1 deletion src/pyaro/timeseries/AutoFilterReaderEngine.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def supported_filters(cls) -> list[Filter]:

:return: list of filters
"""
supported = "variables,stations,countries,bounding_boxes,duplicates,time_bounds,time_resolution,flags,time_variable_station,altitude".split(
supported = "variables,stations,countries,bounding_boxes,duplicates,time_bounds,time_resolution,flags,time_variable_station,altitude,relaltitude".split(
","
)
return [filters.get(name) for name in supported]
Expand Down
166 changes: 164 additions & 2 deletions src/pyaro/timeseries/Filter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
import logging
import math
import abc
from collections import defaultdict
import csv
from datetime import datetime
import inspect
import re
import sys
import types
from typing import Any

import numpy as np

from .Data import Data, Flag
from .Station import Station

try:
# Optional dependencies required for relative altitude filter.
import xarray as xr
from cf_units import Unit
except ImportError:
pass

logger = logging.getLogger(__name__)

class Filter(abc.ABC):
"""Base-class for all filters used from pyaro-Readers"""
Expand Down Expand Up @@ -855,4 +864,157 @@ def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]:
if self._max_altitude is not None:
stations = {n: s for n, s in stations.items() if (not math.isnan(s["altitude"]) and s["altitude"] <= self._max_altitude) }

return stations
return stations

@registered_filter
class RelativeAltitudeFilter(StationFilter):
"""
Filter class which filters stations based on the relative difference between
the station altitude, and the gridded topography altitude.

thorbjoernl marked this conversation as resolved.
Show resolved Hide resolved
https://github.com/metno/pyaro/issues/39
"""
UNITS_METER = Unit("m")
# https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#latitude-coordinate
UNITS_LAT = set(["degrees_north", "degree_north", "degree_N", "degrees_N", "degreeN", "degreesN"])

# https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#longitude-coordinate
UNITS_LON = set(["degrees_east", "degree_east", "degree_E", "degrees_E", "degreeE", "degreesE"])

def __init__(self, topo_file: str | None = None, topo_var: str = "topography", rdiff: float = 0):
"""
:param topo_file : A .nc file from which to read gridded topography data.
:param topo_var : Name of variable that stores altitude.
:param rdiff : Relative difference (in meters).

Note:
-----
- Stations will be kept if abs(altobs-altmod) <= rdiff.
- Stations will not be kept if station altitude is NaN.
thorbjoernl marked this conversation as resolved.
Show resolved Hide resolved

Note:
-----
This filter requires additional dependencies (xarray, netcdf4, cf-units) to function. These can be installed
with `pip install .[optional]
"""
thorbjoernl marked this conversation as resolved.
Show resolved Hide resolved
if "cf_units" not in sys.modules:
logger.warning("relaltitude filter is missing required dependency 'cf-units'. Please install to use this filter.")
if "xarray" not in sys.modules:
logger.warning("relaltitude filter is missing required dependency 'xarray'. Please install to use this filter.")

self._topo_file = topo_file
self._topo_var = topo_var
self._rdiff = rdiff

self._topography = None
if topo_file is not None:
self._topography = xr.open_dataset(topo_file)
self._convert_altitude_to_meters()
self._find_lat_lon_variables()
self._extract_bounding_box()
else:
logger.warning("No topography data provided (topo_file='%s'). Relative elevation filtering will not be applied.", topo_file)

def _convert_altitude_to_meters(self):
"""
Method which attempts to convert the altitude variable in the gridded topography data
to meters.

:raises TypeError
If conversion isn't possible.
"""
# Convert altitude to meters
units = Unit(self._topography[self._topo_var].units)
if units.is_convertible(self.UNITS_METER):
self._topography[self._topo_var].values = self.UNITS_METER.convert(self._topography[self._topo_var].values, self.UNITS_METER)
self._topography[self._topo_var]["units"] = str(self.UNITS_METER)
else:
raise TypeError(f"Expected altitude units to be convertible to 'm', got '{units}'")

def _find_lat_lon_variables(self):
"""
Determines the names of variables which represent the latitude and longitude
dimensions in the topography data.

These are assigned to self._lat, self._lon, respectively for later use.
"""
for var_name in self._topography.coords:
unit_str = self._topography[var_name].attrs.get("units", None)
if unit_str in self.UNITS_LAT:
self._lat = var_name
continue
if unit_str in self.UNITS_LON:
self._lon = var_name
continue

if any(x is None for x in [self._lat, self._lon]):
raise ValueError(f"Required variable names for lat, lon dimensions could not be found in file '{self._topo_file}")

def _extract_bounding_box(self):
"""
Extract the bounding box of the grid.
"""
self._boundary_west = float(self._topography[self._lon].min())
self._boundary_east = float(self._topography[self._lon].max())
self._boundary_south = float(self._topography[self._lat].min())
self._boundary_north = float(self._topography[self._lat].max())
logger.info("Bounding box (NESW): %.2f, %.2f, %.2f, %.2f", self._boundary_north, self._boundary_east, self._boundary_south, self._boundary_west)

def _gridded_altitude_from_lat_lon(self, lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
altitude = self._topography.sel({"lat": xr.DataArray(lat, dims="latlon"), "lon": xr.DataArray(lon, dims="latlon")}, method="nearest")

return altitude[self._topo_var].values[0]

def _is_close(self, alt_gridded: np.ndarray, alt_station: np.ndarray) -> np.ndarray[bool]:
"""
Function to check if two altitudes are within a relative tolerance of each
other.

:param alt_gridded : Gridded altitude (in meters).
:param alt_station : Observation / station altitude (in meters).

:returns :
True if the absolute difference between alt_gridded and alt_station is
<= self._rdiff
"""
return np.abs(alt_gridded-alt_station) <= self._rdiff

def init_kwargs(self):
return {
"topo_file": self._topo_file,
"topo_var": self._topo_var,
"rdiff": self._rdiff
}

def name(self):
return "relaltitude"

def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]:
if self._topography is None:
return stations

names = np.ndarray(len(stations), dtype=np.dtypes.StrDType)
lats = np.ndarray(len(stations), dtype=np.float64)
lons = np.ndarray(len(stations), dtype=np.float64)
alts = np.ndarray(len(stations), dtype=np.float64)

for i, name in enumerate(stations):
station = stations[name]
names[i] = name
lats[i] = station["latitude"]
lons[i] = station["longitude"]
alts[i] = station["altitude"]

out_of_bounds_mask = np.logical_or(np.logical_or(lons < self._boundary_west, lons > self._boundary_east), np.logical_or(lats < self._boundary_south, lats > self._boundary_north))
if np.sum(out_of_bounds_mask) > 0:
logger.warning("Some stations were removed due to being out of bounds of the gridded topography")

topo = self._gridded_altitude_from_lat_lon(lats, lons)

within_rdiff_mask = self._is_close(topo, alts)

mask = np.logical_and(~out_of_bounds_mask, within_rdiff_mask)

selected_names = names[mask]

return {name: stations[name] for name in selected_names}
Loading