From e2d80105e9ebaa0ff20039f6117989811d0ea15c Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Thu, 28 Jul 2022 12:48:46 +0200 Subject: [PATCH 01/25] Added support for returning optimade data in the hdf5 format. --- docs/api_reference/adapters/hdf5.md | 3 + optimade/adapters/hdf5.py | 175 ++++++++++++++++++ optimade/models/jsonapi.py | 7 + .../entry_collections/entry_collections.py | 8 +- optimade/server/middleware.py | 3 +- optimade/server/routers/utils.py | 23 ++- tests/server/conftest.py | 43 ++++- .../query_params/test_response_format.py | 58 ++++++ 8 files changed, 303 insertions(+), 17 deletions(-) create mode 100644 docs/api_reference/adapters/hdf5.md create mode 100644 optimade/adapters/hdf5.py create mode 100644 tests/server/query_params/test_response_format.py diff --git a/docs/api_reference/adapters/hdf5.md b/docs/api_reference/adapters/hdf5.md new file mode 100644 index 000000000..677c355ef --- /dev/null +++ b/docs/api_reference/adapters/hdf5.md @@ -0,0 +1,3 @@ +# hdf5 + +::: optimade.adapters.hdf5 diff --git a/optimade/adapters/hdf5.py b/optimade/adapters/hdf5.py new file mode 100644 index 000000000..61a567e79 --- /dev/null +++ b/optimade/adapters/hdf5.py @@ -0,0 +1,175 @@ +from io import BytesIO +from typing import Union +from pydantic import AnyUrl +from datetime import datetime, timezone +from optimade.models import EntryResponseMany, EntryResponseOne +import h5py +import numpy as np + + +def generate_hdf5_file_content( + response_object: Union[EntryResponseMany, EntryResponseOne, dict, list, tuple] +) -> bytes: + """This function generates the content of a hdf5 file from an EntryResponse object. + It should also be able to handle python dictionaries lists and tuples.""" + + temp_file = BytesIO() + hdf5_file = h5py.File(temp_file, "w") + if isinstance(response_object, (EntryResponseMany, EntryResponseOne)): + response_object = response_object.dict(exclude_unset=True) + store_hdf5_dict(hdf5_file, response_object) + hdf5_file.close() + file_content = temp_file.getvalue() + temp_file.close() + return file_content + + +def store_hdf5_dict(hdf5_file, iterable: Union[dict, list, tuple], group: str = ""): + """This function stores a python list, dictionary or tuple in a hdf5 file. + the currently supported datatypes are str, int, float, list, dict, tuple, bool, AnyUrl, + None ,datetime or any numpy type or numpy array as long as it does not contain a numpy object. + + Parameters: + hdf5_file: An hdf5 file like object. + iterable: The object to be stored in the hdf5 file. + group: This indicates to group in the hdf5 file the list, tuple or dictionary should be added. + """ + if isinstance(iterable, (list, tuple)): + iterable = enumerate(iterable) + elif isinstance(iterable, dict): + iterable = iterable.items() + for x in iterable: + key = str(x[0]) + value = x[1] + if isinstance( + value, (list, tuple) + ): # For now, I assume that all values in the list have the same type. + if len(value) < 1: # case empty list + hdf5_file[group + "/" + key] = [] + continue + val_type = type(value[0]) + if val_type == dict: + hdf5_file.create_group(group + "/" + key) + store_hdf5_dict(hdf5_file, value, group + "/" + key) + elif val_type.__module__ == np.__name__: + if val_type.dtype != object: + hdf5_file[group + "/" + key] = value + else: + raise ValueError( + "Cannot store numpy arrays with dtype: 'object' in hdf5." + ) + elif isinstance(value[0], (int, float)): + hdf5_file[group + "/" + key] = np.asarray(value) + elif isinstance(value[0], str): + hdf5_file[group + "/" + key] = value + elif isinstance(value[0], (list, tuple)): + list_type = get_recursive_type(value[0]) + if list_type in (int, float): + hdf5_file[group + "/" + key] = np.asarray(value) + else: + hdf5_file.create_group(group + "/" + key) + store_hdf5_dict(hdf5_file, value, group + "/" + key) + else: + raise ValueError( + f"The list with type :{val_type} cannot be converted to hdf5." + ) + elif isinstance(value, dict): + hdf5_file.create_group(group + "/" + key) + store_hdf5_dict(hdf5_file, value, group + "/" + key) + elif isinstance(value, bool): + hdf5_file[group + "/" + key] = np.bool_(value) + elif isinstance( + value, AnyUrl + ): # This case hat to be placed above the str case as AnyUrl inherits from the string class, but cannot be handled directly by h5py. + hdf5_file[group + "/" + key] = str(value) + elif isinstance( + value, + ( + int, + float, + str, + ), + ): + hdf5_file[group + "/" + key] = value + elif type(value).__module__ == np.__name__: + if value.dtype != object: + hdf5_file[group + "/" + key] = value + else: + raise ValueError( + "Cannot store numpy arrays with dtype: 'object' in hdf5." + ) + elif isinstance(value, datetime): + hdf5_file[group + "/" + key] = value.astimezone(timezone.utc).strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + elif value is None: + hdf5_file[group + "/" + key] = h5py.Empty( + "f" + ) # hdf5 does not seem to have a proper null or None type. + else: + raise ValueError(f"Do not know how to store a value of {type(value)}") + + +def get_recursive_type(obj): + if isinstance(obj, (list, tuple)): + if len(obj) == 0: + return None + else: + if isinstance(obj[0], (list, tuple)): + return get_recursive_type(obj[0]) + else: + return type(obj[0]) + return type(obj) + + +def generate_response_from_hdf5(response): + temp_file = BytesIO(response) + hdf5_file = h5py.File(temp_file, "r") + response_dict = generate_dict_from_hdf5(hdf5_file, "dict") + return response_dict + + +def generate_dict_from_hdf5(hdf5_file, value_type, dict_tree="/"): + if value_type == "dict": + return_value = {} + for key, value in hdf5_file[dict_tree].items(): + if isinstance(value, h5py._hl.group.Group): + if list(value.keys())[0].isdigit(): + new_value_type = "list" + else: + new_value_type = "dict" + return_value[key] = generate_dict_from_hdf5( + hdf5_file, new_value_type, dict_tree=dict_tree + key + "/" + ) + else: + if isinstance(value[()], h5py._hl.base.Empty): + return_value[key] = None + elif isinstance(value[()], bytes): + return_value[key] = value[()].decode() + else: + return_value[key] = value[ + () + ] # I still have to check which other types I could get. + if value_type == "list": + return_value = [] + for key, value in hdf5_file[dict_tree].items(): + if isinstance(value, h5py._hl.group.Group): + if list(value.keys())[0].isdigit(): + new_value_type = "list" + else: + new_value_type = "dict" + return_value.append( + generate_dict_from_hdf5( + hdf5_file, new_value_type, dict_tree=dict_tree + key + "/" + ) + ) + else: + if isinstance(value[()], h5py._hl.base.Empty): + return_value.append(None) + elif isinstance(value[()], bytes): + return_value.append(value[()].decode()) + else: + return_value.append( + value[()] + ) # I still have to check which other types I could get. + return return_value diff --git a/optimade/models/jsonapi.py b/optimade/models/jsonapi.py index f6724e01b..9013d8be3 100644 --- a/optimade/models/jsonapi.py +++ b/optimade/models/jsonapi.py @@ -8,6 +8,7 @@ parse_obj_as, root_validator, ) +import numpy from optimade.models.utils import StrictField @@ -365,4 +366,10 @@ class Config: datetime: lambda v: v.astimezone(timezone.utc).strftime( "%Y-%m-%dT%H:%M:%SZ" ), + numpy.int32: lambda v: int(v), + numpy.float32: lambda v: float(v), + numpy.int64: lambda v: int(v), + numpy.float64: lambda v: float(v), + numpy.bool_: lambda v: bool(v), + numpy.ndarray: lambda v: v.tolist(), } diff --git a/optimade/server/entry_collections/entry_collections.py b/optimade/server/entry_collections/entry_collections.py index b909339d8..79cd48366 100644 --- a/optimade/server/entry_collections/entry_collections.py +++ b/optimade/server/entry_collections/entry_collections.py @@ -299,12 +299,12 @@ def handle_query_params( cursor_kwargs["filter"] = {} # response_format - if ( - getattr(params, "response_format", False) - and params.response_format != "json" + if getattr(params, "response_format", False) and params.response_format not in ( + "json", + "hdf5", ): raise BadRequest( - detail=f"Response format {params.response_format} is not supported, please use response_format='json'" + detail=f"Response format {params.response_format} is not supported, please use response_format='json' or response_format='hdf5'" ) # page_limit diff --git a/optimade/server/middleware.py b/optimade/server/middleware.py index 56ce24701..c68a06c0c 100644 --- a/optimade/server/middleware.py +++ b/optimade/server/middleware.py @@ -447,7 +447,8 @@ async def dispatch(self, request: Request, call_next): if not isinstance(chunk, bytes): chunk = chunk.encode(charset) body += chunk - body = body.decode(charset) + if response.raw_headers[1][1] == b"application/vnd.api+json": + body = body.decode(charset) if self._warnings: response = json.loads(body) diff --git a/optimade/server/routers/utils.py b/optimade/server/routers/utils.py index dd5f9ff09..d4c4b51ae 100644 --- a/optimade/server/routers/utils.py +++ b/optimade/server/routers/utils.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Any, Dict, List, Set, Union -from fastapi import Request +from fastapi import Request, Response from fastapi.responses import JSONResponse from starlette.datastructures import URL as StarletteURL @@ -22,6 +22,7 @@ from optimade.server.exceptions import BadRequest, InternalServerError from optimade.server.query_params import EntryListingQueryParams, SingleEntryQueryParams from optimade.utils import mongo_id_for_database, get_providers, PROVIDER_LIST_URLS +from optimade.adapters.hdf5 import generate_hdf5_file_content __all__ = ( "BASE_URL_PREFIXES", @@ -265,7 +266,7 @@ def get_entries( if fields or include_fields: results = handle_response_fields(results, fields, include_fields) - return response( + response_object = response( links=links, data=results, meta=meta_values( @@ -277,6 +278,14 @@ def get_entries( ), included=included, ) + if params.response_format == "json": + return response_object + elif params.response_format == "hdf5": + return Response( + content=generate_hdf5_file_content(response_object), + media_type="application/x-hdf5", + headers={"Content-Disposition": "attachment"}, + ) def get_single_entry( @@ -313,7 +322,7 @@ def get_single_entry( if fields or include_fields and results is not None: results = handle_response_fields(results, fields, include_fields)[0] - return response( + response_object = response( links=links, data=results, meta=meta_values( @@ -325,3 +334,11 @@ def get_single_entry( ), included=included, ) + if params.response_format == "json": + return response_object + elif params.response_format == "hdf5": + return Response( + content=generate_hdf5_file_content(response_object), + media_type="application/x-hdf5", + headers={"Content-Disposition": "attachment"}, + ) diff --git a/tests/server/conftest.py b/tests/server/conftest.py index ce013e1b9..813333e14 100644 --- a/tests/server/conftest.py +++ b/tests/server/conftest.py @@ -1,8 +1,7 @@ +import pytest from typing import Union, Dict from optimade.server.warnings import OptimadeWarning - - -import pytest +from optimade.adapters.hdf5 import generate_response_from_hdf5 @pytest.fixture(scope="session") @@ -74,13 +73,17 @@ def inner( pytest.fail("'server' must be either a string or an OptimadeTestClient.") try: + expected_mime_type = get_expeced_response_format(request) response = used_client.get(request, **kwargs) - response_json = response.json() - assert response.status_code == 200, f"Request failed: {response_json}" - expected_mime_type = "application/vnd.api+json" assert ( response.headers["content-type"] == expected_mime_type ), f"Response should have MIME type {expected_mime_type!r}, not {response.headers['content-type']!r}." + if expected_mime_type == "application/vnd.api+json": + response_dict = response.json() + else: + response_dict = generate_response_from_hdf5(response.content) + assert response.status_code == 200, f"Request failed: {response_dict}" + except json.JSONDecodeError: print( f"Request attempted:\n{used_client.base_url}{used_client.version}" @@ -96,12 +99,31 @@ def inner( raise exc else: if return_json: - return response_json + return response_dict return response return inner +def get_expeced_response_format(request: str) -> str: + """This function tries to extract the MIME type from the request string. + If it is unable to do so it returns the default json MIME type. + + Parameters: + request: The request from which the mime type should be extracted. + """ + + expected_mime_type = "application/vnd.api+json" + response_format_start = request.find("response_format") + if response_format_start > -1 and len(request[response_format_start:]) > 15: + returntype = ( + request[15 + response_format_start :].split("=")[1].split("&")[0].strip() + ) + if returntype == "hdf5": + expected_mime_type = "application/x-hdf5" + return expected_mime_type + + @pytest.fixture def check_response(get_good_response): """Check response matches expectations for a given request. @@ -193,17 +215,20 @@ def inner( pytest.fail("'server' must be either a string or an OptimadeTestClient.") try: + expected_mime_type = get_expeced_response_format(request) response = used_client.get(request) assert response.status_code == expected_status, ( f"Request should have been an error with status code {expected_status}, " f"but instead {response.status_code} was received.\nResponse:\n{response.json()}", ) - expected_mime_type = "application/vnd.api+json" assert ( response.headers["content-type"] == expected_mime_type ), f"Response should have MIME type {expected_mime_type!r}, not {response.headers['content-type']!r}." - response = response.json() + if expected_mime_type == "application/vnd.api+json": + response = response.json() + else: + response = generate_response_from_hdf5(response.content) assert len(response["errors"]) == 1, response.get( "errors", "'errors' not found in response" ) diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py new file mode 100644 index 000000000..89f20d115 --- /dev/null +++ b/tests/server/query_params/test_response_format.py @@ -0,0 +1,58 @@ +import numpy +from pydantic import AnyUrl +from pydantic.tools import parse_obj_as +from datetime import datetime +from optimade.models import ReferenceResponseOne +from tests.server.utils import RegularEndpointTests +from optimade.adapters.hdf5 import ( + generate_hdf5_file_content, + generate_response_from_hdf5, +) +from fastapi.encoders import jsonable_encoder +from optimade.models.jsonapi import Response + + +def test_response_format(check_response): + request = '/structures?filter=_exmpl_chemsys="Ac"&response_format=json' + expected_ids = ["mpf_1"] + check_response(request, expected_ids) + + request = '/structures?filter=_exmpl_chemsys="Ac"&response_format=hdf5' + check_response(request, expected_ids) + + +class TestSingleReferenceEndpoint(RegularEndpointTests): + test_id = "dijkstra1968" + request_str = f"/references/{test_id}&response_format=hdf5" + response_cls = ReferenceResponseOne + + +def test_convert_to_hdf5_and_back(): + test_dict = { + "int": 1, + "float": 5.26, + "string": "str", + "datetime": datetime.now(), + "list": [[2.3, 4.5], [8.9, 5.6]], + "dict": {"a key": "a value", "another key": 7.33}, + "tuple": (95, 63), + "bool": False, + "AnyUrl": parse_obj_as(AnyUrl, "https://example.com"), + "None": None, + "empty": [], + "numpy_int64": numpy.int64(42), + "numpy_float32": numpy.float32(0.88153), + "numpy_bool": numpy.bool(True), + "numpy_array": numpy.array([(1, 2), (3, 4)]), + } + + hdf5_file_content = generate_hdf5_file_content(test_dict) + + returned_dict = generate_response_from_hdf5(hdf5_file_content) + reference_dict = jsonable_encoder( + test_dict, custom_encoder=Response.Config.json_encoders + ) + returned_dict = jsonable_encoder( + returned_dict, custom_encoder=Response.Config.json_encoders + ) + assert reference_dict == returned_dict From 079bd713546e2ad650c27aacc2cff16a58387f35 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Fri, 29 Jul 2022 15:39:25 +0200 Subject: [PATCH 02/25] Added extra doctstrings to hdf5.py and made setting for enabling/disabling hdf5 response format. --- optimade/adapters/hdf5.py | 140 ++++++++++++------ optimade/server/config.py | 4 + .../entry_collections/entry_collections.py | 8 +- 3 files changed, 104 insertions(+), 48 deletions(-) diff --git a/optimade/adapters/hdf5.py b/optimade/adapters/hdf5.py index 61a567e79..b8a8776aa 100644 --- a/optimade/adapters/hdf5.py +++ b/optimade/adapters/hdf5.py @@ -1,5 +1,5 @@ from io import BytesIO -from typing import Union +from typing import Union, Any from pydantic import AnyUrl from datetime import datetime, timezone from optimade.models import EntryResponseMany, EntryResponseOne @@ -7,11 +7,33 @@ import numpy as np +"""This adaptor class can be used to generate a hdf5 response instead of a json response and to convert the hdf5 response back into an python dictionary. +It can handle numeric data in a binary format compatible with numpy. +It is therefore more efficient than the JSON format at returning large amounts of numeric data. +It however also has more overhead resulting in a larger response for entries with little numeric data. +To enable support for your server the parameter "enabled_response_formats" can be specified in the config file. +It is a list of the supported response_formats. To support the hdf5 return format it should be set to: ["json", "hdf5"] +(support for the JSON format is mandatory) + +Unfortunately, h5py does not support storing objects with the numpy.object type. +It is therefore not possible to directly store a list of dictionaries in a hdf5 file with h5py. +As a workaround, the index of a value in a list is used as a dictionary key so a list can be stored as a dictionary if neccesary. +""" + + def generate_hdf5_file_content( response_object: Union[EntryResponseMany, EntryResponseOne, dict, list, tuple] ) -> bytes: """This function generates the content of a hdf5 file from an EntryResponse object. - It should also be able to handle python dictionaries lists and tuples.""" + It should also be able to handle python dictionaries lists and tuples. + + Parameters: + response_object: an OPTIMADE response object. This can be of any OPTIMADE entry type, such as structure, + reference etc. + + Returns: + A binary object containing the contents of the hdf5 file. + """ temp_file = BytesIO() hdf5_file = h5py.File(temp_file, "w") @@ -27,12 +49,20 @@ def generate_hdf5_file_content( def store_hdf5_dict(hdf5_file, iterable: Union[dict, list, tuple], group: str = ""): """This function stores a python list, dictionary or tuple in a hdf5 file. the currently supported datatypes are str, int, float, list, dict, tuple, bool, AnyUrl, - None ,datetime or any numpy type or numpy array as long as it does not contain a numpy object. + None ,datetime or any numpy type or numpy array. + + Unfortunately, h5py does not support storing objects with the numpy.object type. + It is therefore not possible to directly store a list of dictionaries in a hdf5 file with h5py. + As a workaround, the index of a value in a list is used as a dictionary key so a list can be stored as a dictionary if neccesary. Parameters: hdf5_file: An hdf5 file like object. iterable: The object to be stored in the hdf5 file. group: This indicates to group in the hdf5 file the list, tuple or dictionary should be added. + + Raises: + ValueError: If this function encounters an object with a type that it cannot convert to the hdf5 format + a ValueError is raised. """ if isinstance(iterable, (list, tuple)): iterable = enumerate(iterable) @@ -110,7 +140,17 @@ def store_hdf5_dict(hdf5_file, iterable: Union[dict, list, tuple], group: str = raise ValueError(f"Do not know how to store a value of {type(value)}") -def get_recursive_type(obj): +def get_recursive_type(obj: Any) -> type: + """If obj is a list or tuple this function returns the type of the first object in the list/tuple that is not a list + or tuple. If the list or tuple is empty it returns None. + Finally if the object is not a list or tuple it returns the type of the object. + + Parameters: + obj: any python object + + Returns: + The type of the objects that the object contains or the type of the object itself when it does not contain other objects.""" + if isinstance(obj, (list, tuple)): if len(obj) == 0: return None @@ -122,54 +162,66 @@ def get_recursive_type(obj): return type(obj) -def generate_response_from_hdf5(response): - temp_file = BytesIO(response) +def generate_response_from_hdf5(hdf5_content: bytes) -> dict: + """Generates a response_dict from a HDF5 file like object. + It is similar to the response_dict generated from the JSON response, except that the numerical data will have numpy + types. + + Parameters: + hdf5_content(bytes): the content of a hdf5 file. + + Returns: + A dictionary containing the data of the hdf5 file.""" + + temp_file = BytesIO(hdf5_content) hdf5_file = h5py.File(temp_file, "r") - response_dict = generate_dict_from_hdf5(hdf5_file, "dict") + response_dict = generate_dict_from_hdf5(hdf5_file) return response_dict -def generate_dict_from_hdf5(hdf5_file, value_type, dict_tree="/"): - if value_type == "dict": - return_value = {} - for key, value in hdf5_file[dict_tree].items(): +def generate_dict_from_hdf5( + hdf5_file: h5py._hl.files.File, group: str = "/" +) -> Union[dict, list]: + """This function returns the content of a hdf5 group. + Because of the workaround described under the store_hdf5_dict function, groups which have numbers as keys will be turned to lists(No guartee that the order is the same as in th eoriginal list). + Otherwise, the group will be turned into a dict. + + Parameters: + hdf5_file: An HDF5_object containing the data that should be converted to a dictionary or list. + group: The hdf5 group for which the dictionary should be created. The default is "/" which will return all the data in the hdf5_object + + Returns: + A dict or list containing the content of the hdf5 group. + """ + + return_value = None + for key, value in hdf5_file[group].items(): + if key.isdigit(): + if return_value is None: + return_value = [] if isinstance(value, h5py._hl.group.Group): - if list(value.keys())[0].isdigit(): - new_value_type = "list" - else: - new_value_type = "dict" - return_value[key] = generate_dict_from_hdf5( - hdf5_file, new_value_type, dict_tree=dict_tree + key + "/" + return_value.append( + generate_dict_from_hdf5(hdf5_file, group=group + key + "/") ) + elif isinstance(value[()], h5py._hl.base.Empty): + return_value.append(None) + elif isinstance(value[()], bytes): + return_value.append(value[()].decode()) else: - if isinstance(value[()], h5py._hl.base.Empty): - return_value[key] = None - elif isinstance(value[()], bytes): - return_value[key] = value[()].decode() - else: - return_value[key] = value[ - () - ] # I still have to check which other types I could get. - if value_type == "list": - return_value = [] - for key, value in hdf5_file[dict_tree].items(): + return_value.append(value[()]) + + else: # Case dictionary + if return_value is None: + return_value = {} if isinstance(value, h5py._hl.group.Group): - if list(value.keys())[0].isdigit(): - new_value_type = "list" - else: - new_value_type = "dict" - return_value.append( - generate_dict_from_hdf5( - hdf5_file, new_value_type, dict_tree=dict_tree + key + "/" - ) + return_value[key] = generate_dict_from_hdf5( + hdf5_file, group=group + key + "/" ) + elif isinstance(value[()], h5py._hl.base.Empty): + return_value[key] = None + elif isinstance(value[()], bytes): + return_value[key] = value[()].decode() else: - if isinstance(value[()], h5py._hl.base.Empty): - return_value.append(None) - elif isinstance(value[()], bytes): - return_value.append(value[()].decode()) - else: - return_value.append( - value[()] - ) # I still have to check which other types I could get. + return_value[key] = value[()] + return return_value diff --git a/optimade/server/config.py b/optimade/server/config.py index ac81692e9..de14479ce 100644 --- a/optimade/server/config.py +++ b/optimade/server/config.py @@ -280,6 +280,10 @@ class ServerConfig(BaseSettings): True, description="If True, the server will check whether the query parameters given in the request are correct.", ) + enabled_response_formats: Optional[List[str]] = Field( + ["json"], + description="""A list of the response formats that are supported by this server. Must include the "json" format.""", + ) @validator("implementation", pre=True) def set_implementation_version(cls, v): diff --git a/optimade/server/entry_collections/entry_collections.py b/optimade/server/entry_collections/entry_collections.py index 79cd48366..0c6de2d50 100644 --- a/optimade/server/entry_collections/entry_collections.py +++ b/optimade/server/entry_collections/entry_collections.py @@ -299,12 +299,12 @@ def handle_query_params( cursor_kwargs["filter"] = {} # response_format - if getattr(params, "response_format", False) and params.response_format not in ( - "json", - "hdf5", + if ( + getattr(params, "response_format", False) + and params.response_format not in CONFIG.enabled_response_formats ): raise BadRequest( - detail=f"Response format {params.response_format} is not supported, please use response_format='json' or response_format='hdf5'" + detail=f"Response format {params.response_format} is not supported, please use one of the supported response_formats: {','.join(CONFIG.enabled_response_formats)}" ) # page_limit From 0b71e9e4eb0c02f7a0a9b38216d3cc5d5ef144ee Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Fri, 29 Jul 2022 16:26:41 +0200 Subject: [PATCH 03/25] Added dependancies for hdf5 response to requirements.txt and setup.py. --- requirements.txt | 2 ++ setup.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 805d4cea5..5bb56a5cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ elasticsearch-dsl==7.4.0 email_validator==1.2.1 fastapi==0.79.0 +h5py==3.7.0 lark==1.1.2 mongomock==4.1.2 +numpy==1.23.0 pydantic==1.9.1 pymongo==4.1.1 pyyaml==5.4 diff --git a/setup.py b/setup.py index f8fee0d62..5eb216c33 100644 --- a/setup.py +++ b/setup.py @@ -19,10 +19,15 @@ # Server minded elastic_deps = ["elasticsearch-dsl~=7.4,<8.0"] mongo_deps = ["pymongo>=3.12.1,<5", "mongomock~=4.1"] -server_deps = [ - "uvicorn~=0.18", - "pyyaml>=5.4,<7", # Keep at pyyaml 5.4 for aiida-core support -] + mongo_deps +hdf5_deps = ["h5py==3.7.0", "numpy==1.23.0"] +server_deps = ( + [ + "uvicorn~=0.18", + "pyyaml>=5.4,<7", # Keep at pyyaml 5.4 for aiida-core support + ] + + mongo_deps + + hdf5_deps +) # Client minded From 9167351a109e51f9e685f7495b96ebc1613d08d7 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Fri, 29 Jul 2022 18:27:00 +0200 Subject: [PATCH 04/25] Added enabled_response_formats to test config and disabled hdf5 tests incase hdf5 is not in the enabled_response_formats. --- .../query_params/test_response_format.py | 80 ++++++++++--------- tests/test_config.json | 3 +- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index 89f20d115..eaf9a4dc4 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -10,6 +10,7 @@ ) from fastapi.encoders import jsonable_encoder from optimade.models.jsonapi import Response +from optimade.server.config import CONFIG def test_response_format(check_response): @@ -17,42 +18,47 @@ def test_response_format(check_response): expected_ids = ["mpf_1"] check_response(request, expected_ids) - request = '/structures?filter=_exmpl_chemsys="Ac"&response_format=hdf5' - check_response(request, expected_ids) + if "hdf5" in CONFIG.enabled_response_formats: + request = '/structures?filter=_exmpl_chemsys="Ac"&response_format=hdf5' + check_response(request, expected_ids) + + +if "hdf5" in CONFIG.enabled_response_formats: + + class TestSingleReferenceEndpoint(RegularEndpointTests): + test_id = "dijkstra1968" + request_str = f"/references/{test_id}&response_format=hdf5" + response_cls = ReferenceResponseOne + + +if "hdf5" in CONFIG.enabled_response_formats: + + def test_convert_to_hdf5_and_back(): + test_dict = { + "int": 1, + "float": 5.26, + "string": "str", + "datetime": datetime.now(), + "list": [[2.3, 4.5], [8.9, 5.6]], + "dict": {"a key": "a value", "another key": 7.33}, + "tuple": (95, 63), + "bool": False, + "AnyUrl": parse_obj_as(AnyUrl, "https://example.com"), + "None": None, + "empty": [], + "numpy_int64": numpy.int64(42), + "numpy_float32": numpy.float32(0.88153), + "numpy_bool": numpy.bool(True), + "numpy_array": numpy.array([(1, 2), (3, 4)]), + } + hdf5_file_content = generate_hdf5_file_content(test_dict) -class TestSingleReferenceEndpoint(RegularEndpointTests): - test_id = "dijkstra1968" - request_str = f"/references/{test_id}&response_format=hdf5" - response_cls = ReferenceResponseOne - - -def test_convert_to_hdf5_and_back(): - test_dict = { - "int": 1, - "float": 5.26, - "string": "str", - "datetime": datetime.now(), - "list": [[2.3, 4.5], [8.9, 5.6]], - "dict": {"a key": "a value", "another key": 7.33}, - "tuple": (95, 63), - "bool": False, - "AnyUrl": parse_obj_as(AnyUrl, "https://example.com"), - "None": None, - "empty": [], - "numpy_int64": numpy.int64(42), - "numpy_float32": numpy.float32(0.88153), - "numpy_bool": numpy.bool(True), - "numpy_array": numpy.array([(1, 2), (3, 4)]), - } - - hdf5_file_content = generate_hdf5_file_content(test_dict) - - returned_dict = generate_response_from_hdf5(hdf5_file_content) - reference_dict = jsonable_encoder( - test_dict, custom_encoder=Response.Config.json_encoders - ) - returned_dict = jsonable_encoder( - returned_dict, custom_encoder=Response.Config.json_encoders - ) - assert reference_dict == returned_dict + returned_dict = generate_response_from_hdf5(hdf5_file_content) + reference_dict = jsonable_encoder( + test_dict, custom_encoder=Response.Config.json_encoders + ) + returned_dict = jsonable_encoder( + returned_dict, custom_encoder=Response.Config.json_encoders + ) + assert reference_dict == returned_dict diff --git a/tests/test_config.json b/tests/test_config.json index 84e05066c..59e8c5c29 100644 --- a/tests/test_config.json +++ b/tests/test_config.json @@ -34,5 +34,6 @@ "structures": { "chemsys": "nelements" } - } + }, + "enabled_response_formats": ["json","hdf5"] } From 7551132931c0c75f152d00cd60c3876ad815cdc4 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Fri, 29 Jul 2022 18:27:00 +0200 Subject: [PATCH 05/25] Added enabled_response_formats to test config and disabled hdf5 tests incase hdf5 is not in the enabled_response_formats. --- .../query_params/test_response_format.py | 80 ++++++++++--------- tests/test_config.json | 3 +- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index 89f20d115..460f955ce 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -10,6 +10,7 @@ ) from fastapi.encoders import jsonable_encoder from optimade.models.jsonapi import Response +from optimade.server.config import CONFIG def test_response_format(check_response): @@ -17,42 +18,47 @@ def test_response_format(check_response): expected_ids = ["mpf_1"] check_response(request, expected_ids) - request = '/structures?filter=_exmpl_chemsys="Ac"&response_format=hdf5' - check_response(request, expected_ids) + if "hdf5" in CONFIG.enabled_response_formats: + request = '/structures?filter=_exmpl_chemsys="Ac"&response_format=hdf5' + check_response(request, expected_ids) + + +if "hdf5" in CONFIG.enabled_response_formats: + + class TestSingleReferenceEndpoint(RegularEndpointTests): + test_id = "dijkstra1968" + request_str = f"/references/{test_id}&response_format=hdf5" + response_cls = ReferenceResponseOne + + +if "hdf5" in CONFIG.enabled_response_formats: + + def test_convert_to_hdf5_and_back(): + test_dict = { + "int": 1, + "float": 5.26, + "string": "str", + "datetime": datetime.now(), + "list": [[2.3, 4.5], [8.9, 5.6]], + "dict": {"a key": "a value", "another key": 7.33}, + "tuple": (95, 63), + "bool": False, + "AnyUrl": parse_obj_as(AnyUrl, "https://example.com"), + "None": None, + "empty": [], + "numpy_int64": numpy.int64(42), + "numpy_float32": numpy.float32(0.88153), + "numpy_bool": numpy.bool_(True), + "numpy_array": numpy.array([(1, 2), (3, 4)]), + } + hdf5_file_content = generate_hdf5_file_content(test_dict) -class TestSingleReferenceEndpoint(RegularEndpointTests): - test_id = "dijkstra1968" - request_str = f"/references/{test_id}&response_format=hdf5" - response_cls = ReferenceResponseOne - - -def test_convert_to_hdf5_and_back(): - test_dict = { - "int": 1, - "float": 5.26, - "string": "str", - "datetime": datetime.now(), - "list": [[2.3, 4.5], [8.9, 5.6]], - "dict": {"a key": "a value", "another key": 7.33}, - "tuple": (95, 63), - "bool": False, - "AnyUrl": parse_obj_as(AnyUrl, "https://example.com"), - "None": None, - "empty": [], - "numpy_int64": numpy.int64(42), - "numpy_float32": numpy.float32(0.88153), - "numpy_bool": numpy.bool(True), - "numpy_array": numpy.array([(1, 2), (3, 4)]), - } - - hdf5_file_content = generate_hdf5_file_content(test_dict) - - returned_dict = generate_response_from_hdf5(hdf5_file_content) - reference_dict = jsonable_encoder( - test_dict, custom_encoder=Response.Config.json_encoders - ) - returned_dict = jsonable_encoder( - returned_dict, custom_encoder=Response.Config.json_encoders - ) - assert reference_dict == returned_dict + returned_dict = generate_response_from_hdf5(hdf5_file_content) + reference_dict = jsonable_encoder( + test_dict, custom_encoder=Response.Config.json_encoders + ) + returned_dict = jsonable_encoder( + returned_dict, custom_encoder=Response.Config.json_encoders + ) + assert reference_dict == returned_dict diff --git a/tests/test_config.json b/tests/test_config.json index 84e05066c..59e8c5c29 100644 --- a/tests/test_config.json +++ b/tests/test_config.json @@ -34,5 +34,6 @@ "structures": { "chemsys": "nelements" } - } + }, + "enabled_response_formats": ["json","hdf5"] } From 79520920dc6240e151233a981f82d4a4525bc343 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Fri, 29 Jul 2022 19:06:24 +0200 Subject: [PATCH 06/25] checking whether the not installing of numpy on github server was caused by having two different versions in setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5eb216c33..595068491 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ # Server minded elastic_deps = ["elasticsearch-dsl~=7.4,<8.0"] mongo_deps = ["pymongo>=3.12.1,<5", "mongomock~=4.1"] -hdf5_deps = ["h5py==3.7.0", "numpy==1.23.0"] +hdf5_deps = ["h5py==3.7.0", "numpy~=1.23"] server_deps = ( [ "uvicorn~=0.18", @@ -40,7 +40,7 @@ "click~=8.1", ] ase_deps = ["ase~=3.22"] -cif_deps = ["numpy~=1.21"] +cif_deps = ["numpy~=1.23"] pdb_deps = cif_deps pymatgen_deps = ["pymatgen==2022.0.16"] jarvis_deps = ["jarvis-tools==2022.5.20"] From 694894faf823cdfcc7d5b9b1fefa0ea9a3c08480 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Fri, 29 Jul 2022 19:12:06 +0200 Subject: [PATCH 07/25] added hdf5_deps to extras_require. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 595068491..b7f41eaa8 100644 --- a/setup.py +++ b/setup.py @@ -122,6 +122,7 @@ "client": client_deps, "elastic": elastic_deps, "mongo": mongo_deps, + "hdf5": hdf5_deps, "aiida": aiida_deps, "ase": ase_deps, "cif": cif_deps, From 8d51f557bc3024ceeb789bca0b0469ad36aef2aa Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Fri, 29 Jul 2022 19:20:05 +0200 Subject: [PATCH 08/25] Added numpy and h5py to install_requirements in setup.py --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index b7f41eaa8..3eac3a707 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,8 @@ "pydantic~=1.9", "email_validator~=1.2", "requests~=2.28", + "numpy~=1.23", + "h5py~=3.7", ], extras_require={ "all": all_deps, From 12b79e02febc9487c5541a8ce7d4fb5aa6e81f0d Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Fri, 29 Jul 2022 19:48:09 +0200 Subject: [PATCH 09/25] Use a query that does not have an _exampl_ field to test response format. --- tests/server/query_params/test_response_format.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index 460f955ce..2429b8e78 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -14,12 +14,16 @@ def test_response_format(check_response): - request = '/structures?filter=_exmpl_chemsys="Ac"&response_format=json' + request = ( + '/structures?filter=chemical_formula_descriptive="Ac"&response_format=json' + ) expected_ids = ["mpf_1"] check_response(request, expected_ids) if "hdf5" in CONFIG.enabled_response_formats: - request = '/structures?filter=_exmpl_chemsys="Ac"&response_format=hdf5' + request = ( + '/structures?filter=chemical_formula_descriptive="Ac"&response_format=hdf5' + ) check_response(request, expected_ids) From 9fe4dccd2b8a12b4e60576736bae9fa9ac0a1846 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Wed, 3 Aug 2022 19:26:08 +0200 Subject: [PATCH 10/25] Added extra test and the supported response formats are now listed at the info endpoint. --- optimade/adapters/hdf5.py | 30 ++++++++++--------- optimade/server/routers/info.py | 7 +++-- .../query_params/test_response_format.py | 13 +++++++- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/optimade/adapters/hdf5.py b/optimade/adapters/hdf5.py index b8a8776aa..d6a98698d 100644 --- a/optimade/adapters/hdf5.py +++ b/optimade/adapters/hdf5.py @@ -61,7 +61,7 @@ def store_hdf5_dict(hdf5_file, iterable: Union[dict, list, tuple], group: str = group: This indicates to group in the hdf5 file the list, tuple or dictionary should be added. Raises: - ValueError: If this function encounters an object with a type that it cannot convert to the hdf5 format + TypeError: If this function encounters an object with a type that it cannot convert to the hdf5 format a ValueError is raised. """ if isinstance(iterable, (list, tuple)): @@ -82,11 +82,12 @@ def store_hdf5_dict(hdf5_file, iterable: Union[dict, list, tuple], group: str = hdf5_file.create_group(group + "/" + key) store_hdf5_dict(hdf5_file, value, group + "/" + key) elif val_type.__module__ == np.__name__: - if val_type.dtype != object: + try: hdf5_file[group + "/" + key] = value - else: - raise ValueError( - "Cannot store numpy arrays with dtype: 'object' in hdf5." + except (TypeError) as hdf5_error: + raise TypeError( + "Unfortunatly more complex numpy types like object can not yet be stored in hdf5. Error from hdf5:" + + hdf5_error ) elif isinstance(value[0], (int, float)): hdf5_file[group + "/" + key] = np.asarray(value) @@ -110,7 +111,7 @@ def store_hdf5_dict(hdf5_file, iterable: Union[dict, list, tuple], group: str = hdf5_file[group + "/" + key] = np.bool_(value) elif isinstance( value, AnyUrl - ): # This case hat to be placed above the str case as AnyUrl inherits from the string class, but cannot be handled directly by h5py. + ): # This case had to be placed above the str case as AnyUrl inherits from the string class, but cannot be handled directly by h5py. hdf5_file[group + "/" + key] = str(value) elif isinstance( value, @@ -122,22 +123,23 @@ def store_hdf5_dict(hdf5_file, iterable: Union[dict, list, tuple], group: str = ): hdf5_file[group + "/" + key] = value elif type(value).__module__ == np.__name__: - if value.dtype != object: + try: hdf5_file[group + "/" + key] = value - else: - raise ValueError( - "Cannot store numpy arrays with dtype: 'object' in hdf5." + except (TypeError) as hdf5_error: + raise TypeError( + "Unfortunatly more complex numpy types like object can not yet be stored in hdf5. Error from hdf5:" + + hdf5_error ) elif isinstance(value, datetime): hdf5_file[group + "/" + key] = value.astimezone(timezone.utc).strftime( "%Y-%m-%dT%H:%M:%SZ" ) elif value is None: - hdf5_file[group + "/" + key] = h5py.Empty( - "f" - ) # hdf5 does not seem to have a proper null or None type. + hdf5_file[group + "/" + key] = h5py.Empty("f") else: - raise ValueError(f"Do not know how to store a value of {type(value)}") + raise ValueError( + f"Unable to store a value of type: {type(value)} in hdf5 format." + ) def get_recursive_type(obj: Any) -> type: diff --git a/optimade/server/routers/info.py b/optimade/server/routers/info.py index ddd48adfd..aa43d603b 100644 --- a/optimade/server/routers/info.py +++ b/optimade/server/routers/info.py @@ -40,7 +40,7 @@ def get_info(request: Request) -> InfoResponse: "version": __api_version__, } ], - formats=["json"], + formats=CONFIG.enabled_response_formats, available_endpoints=["info", "links"] + list(ENTRY_INFO_SCHEMAS.keys()), entry_types_by_format={"json": list(ENTRY_INFO_SCHEMAS.keys())}, is_index=False, @@ -71,8 +71,9 @@ def get_entry_info(request: Request, entry: str) -> EntryInfoResponse: properties = retrieve_queryable_properties( schema, queryable_properties, entry_type=entry ) - - output_fields_by_format = {"json": list(properties.keys())} + output_fields_by_format = {} + for outputformat in CONFIG.enabled_response_formats: + output_fields_by_format[outputformat] = list(properties.keys()) return EntryInfoResponse( meta=meta_values( diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index 2429b8e78..585f459ef 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -43,7 +43,7 @@ def test_convert_to_hdf5_and_back(): "float": 5.26, "string": "str", "datetime": datetime.now(), - "list": [[2.3, 4.5], [8.9, 5.6]], + "list": [[[2.3, 6.3], [8.6, 4.5]], [[8.9, 9.4], [5.6, 3.5]]], "dict": {"a key": "a value", "another key": 7.33}, "tuple": (95, 63), "bool": False, @@ -66,3 +66,14 @@ def test_convert_to_hdf5_and_back(): returned_dict, custom_encoder=Response.Config.json_encoders ) assert reference_dict == returned_dict + + +def test_unsupported_response_format(check_error_response): + request = '/structures?filter=chemical_formula_descriptive="Ac"&response_format=png' + error_detail = f"Response format png is not supported, please use one of the supported response_formats: {','.join(CONFIG.enabled_response_formats)}" + check_error_response( + request, + expected_status=400, + expected_title="Bad Request", + expected_detail=error_detail, + ) From 198103241f0638f882a218d8d2e58a0e56a64aa1 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Thu, 4 Aug 2022 11:57:16 +0200 Subject: [PATCH 11/25] Made some changes to the docstrings and type definitions so it will hopefully pass the docs test on Github. --- optimade/adapters/hdf5.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/optimade/adapters/hdf5.py b/optimade/adapters/hdf5.py index d6a98698d..bb9a655e6 100644 --- a/optimade/adapters/hdf5.py +++ b/optimade/adapters/hdf5.py @@ -28,8 +28,7 @@ def generate_hdf5_file_content( It should also be able to handle python dictionaries lists and tuples. Parameters: - response_object: an OPTIMADE response object. This can be of any OPTIMADE entry type, such as structure, - reference etc. + response_object: an OPTIMADE response object. This can be of any OPTIMADE entry type, such as structure, reference etc. Returns: A binary object containing the contents of the hdf5 file. @@ -46,7 +45,9 @@ def generate_hdf5_file_content( return file_content -def store_hdf5_dict(hdf5_file, iterable: Union[dict, list, tuple], group: str = ""): +def store_hdf5_dict( + hdf5_file: h5py._hl.files.File, iterable: Union[dict, list, tuple], group: str = "" +): """This function stores a python list, dictionary or tuple in a hdf5 file. the currently supported datatypes are str, int, float, list, dict, tuple, bool, AnyUrl, None ,datetime or any numpy type or numpy array. @@ -170,7 +171,7 @@ def generate_response_from_hdf5(hdf5_content: bytes) -> dict: types. Parameters: - hdf5_content(bytes): the content of a hdf5 file. + hdf5_content: the content of a hdf5 file. Returns: A dictionary containing the data of the hdf5 file.""" From 79b48d6ad2bf83864762a84c0a4dc1ed925c1ccd Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Thu, 4 Aug 2022 14:58:20 +0200 Subject: [PATCH 12/25] The test for the single entry point did not work. This is fixed now --- tests/server/query_params/test_response_format.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index 585f459ef..2c01fdf28 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -2,8 +2,6 @@ from pydantic import AnyUrl from pydantic.tools import parse_obj_as from datetime import datetime -from optimade.models import ReferenceResponseOne -from tests.server.utils import RegularEndpointTests from optimade.adapters.hdf5 import ( generate_hdf5_file_content, generate_response_from_hdf5, @@ -29,13 +27,11 @@ def test_response_format(check_response): if "hdf5" in CONFIG.enabled_response_formats: - class TestSingleReferenceEndpoint(RegularEndpointTests): - test_id = "dijkstra1968" - request_str = f"/references/{test_id}&response_format=hdf5" - response_cls = ReferenceResponseOne - - -if "hdf5" in CONFIG.enabled_response_formats: + def test_single_entry(check_response): + """For single entry. Default value for `include` is 'references'""" + request = "/structures/mpf_1?response_format=hdf5" + expected_ids = "mpf_1" + check_response(request, expected_ids) def test_convert_to_hdf5_and_back(): test_dict = { From 687ea78436a626bced6461caf7d72e1be8d5cfe0 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Thu, 4 Aug 2022 20:01:55 +0200 Subject: [PATCH 13/25] Added more thorough check to see whetehr the response contnet type is json. --- optimade/server/middleware.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/optimade/server/middleware.py b/optimade/server/middleware.py index c68a06c0c..c6eb00fc1 100644 --- a/optimade/server/middleware.py +++ b/optimade/server/middleware.py @@ -447,8 +447,13 @@ async def dispatch(self, request: Request, call_next): if not isinstance(chunk, bytes): chunk = chunk.encode(charset) body += chunk - if response.raw_headers[1][1] == b"application/vnd.api+json": - body = body.decode(charset) + for i in range(len(response.raw_headers)): + if ( + response.raw_headers[i][0] == b"content-type" + and response.raw_headers[i][1] == b"application/vnd.api+json" + ): + body = body.decode(charset) + break if self._warnings: response = json.loads(body) From fbfe0f72072b2190ec23594e0c0bababb5f60232 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Thu, 4 Aug 2022 22:07:24 +0200 Subject: [PATCH 14/25] Remove numpy and h5py from 'install_requires'. --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 3eac3a707..b7f41eaa8 100644 --- a/setup.py +++ b/setup.py @@ -111,8 +111,6 @@ "pydantic~=1.9", "email_validator~=1.2", "requests~=2.28", - "numpy~=1.23", - "h5py~=3.7", ], extras_require={ "all": all_deps, From a55bd82b5bf4a6c5a659d06a20555b1016f8f8a9 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Thu, 4 Aug 2022 22:16:42 +0200 Subject: [PATCH 15/25] Revert "Remove numpy and h5py from 'install_requires'." This reverts commit fbfe0f72072b2190ec23594e0c0bababb5f60232. --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index b7f41eaa8..3eac3a707 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,8 @@ "pydantic~=1.9", "email_validator~=1.2", "requests~=2.28", + "numpy~=1.23", + "h5py~=3.7", ], extras_require={ "all": all_deps, From 43e326f7e63d720f5922184d5887c357b5b3d4df Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Thu, 4 Aug 2022 22:21:01 +0200 Subject: [PATCH 16/25] Remove h5py_deps and put numpy and h5py back in install_requires. --- setup.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 3eac3a707..8f97c176c 100644 --- a/setup.py +++ b/setup.py @@ -19,15 +19,10 @@ # Server minded elastic_deps = ["elasticsearch-dsl~=7.4,<8.0"] mongo_deps = ["pymongo>=3.12.1,<5", "mongomock~=4.1"] -hdf5_deps = ["h5py==3.7.0", "numpy~=1.23"] -server_deps = ( - [ - "uvicorn~=0.18", - "pyyaml>=5.4,<7", # Keep at pyyaml 5.4 for aiida-core support - ] - + mongo_deps - + hdf5_deps -) +server_deps = [ + "uvicorn~=0.18", + "pyyaml>=5.4,<7", # Keep at pyyaml 5.4 for aiida-core support +] + mongo_deps # Client minded @@ -124,7 +119,6 @@ "client": client_deps, "elastic": elastic_deps, "mongo": mongo_deps, - "hdf5": hdf5_deps, "aiida": aiida_deps, "ase": ase_deps, "cif": cif_deps, From 1e7e3f95c3b1af45aa49ff158dd57c4d36ffe0e6 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Tue, 9 Aug 2022 13:13:54 +0200 Subject: [PATCH 17/25] Processed comments from code review. --- optimade/adapters/hdf5.py | 2 +- optimade/server/config.py | 17 ++++++++++++++++- .../entry_collections/entry_collections.py | 4 ++-- optimade/server/routers/info.py | 13 ++++++++----- optimade/server/routers/utils.py | 8 ++++++++ setup.py | 14 +++++++++----- .../server/query_params/test_response_format.py | 2 +- 7 files changed, 45 insertions(+), 15 deletions(-) diff --git a/optimade/adapters/hdf5.py b/optimade/adapters/hdf5.py index bb9a655e6..2943b6806 100644 --- a/optimade/adapters/hdf5.py +++ b/optimade/adapters/hdf5.py @@ -7,7 +7,7 @@ import numpy as np -"""This adaptor class can be used to generate a hdf5 response instead of a json response and to convert the hdf5 response back into an python dictionary. +"""This adaptor can be used to generate a hdf5 response instead of a json response and to convert the hdf5 response back into an python dictionary. It can handle numeric data in a binary format compatible with numpy. It is therefore more efficient than the JSON format at returning large amounts of numeric data. It however also has more overhead resulting in a larger response for entries with little numeric data. diff --git a/optimade/server/config.py b/optimade/server/config.py index de14479ce..969b8e242 100644 --- a/optimade/server/config.py +++ b/optimade/server/config.py @@ -68,6 +68,18 @@ class SupportedBackend(Enum): MONGOMOCK = "mongomock" +class SupportedResponseFormats(Enum): + """Enumeration of supported database backends + + - 'JSON': [JSON](https://www.json.org/json-en.html) + - 'HDF5': [HDF5](https://portal.hdfgroup.org/display/HDF5/HDF5) + + """ + + HDF5 = "hdf5" + JSON = "json" + + def config_file_settings(settings: BaseSettings) -> Dict[str, Any]: """Configuration file settings source. @@ -280,7 +292,7 @@ class ServerConfig(BaseSettings): True, description="If True, the server will check whether the query parameters given in the request are correct.", ) - enabled_response_formats: Optional[List[str]] = Field( + enabled_response_formats: Optional[List[SupportedResponseFormats]] = Field( ["json"], description="""A list of the response formats that are supported by this server. Must include the "json" format.""", ) @@ -311,6 +323,9 @@ def use_real_mongo_override(cls, values): return values + def get_enabled_response_formats(self): + return [e.value for e in self.enabled_response_formats] + class Config: """ This is a pydantic model Config object that modifies the behaviour of diff --git a/optimade/server/entry_collections/entry_collections.py b/optimade/server/entry_collections/entry_collections.py index 0c6de2d50..2d7d4aa49 100644 --- a/optimade/server/entry_collections/entry_collections.py +++ b/optimade/server/entry_collections/entry_collections.py @@ -301,10 +301,10 @@ def handle_query_params( # response_format if ( getattr(params, "response_format", False) - and params.response_format not in CONFIG.enabled_response_formats + and params.response_format not in CONFIG.get_enabled_response_formats() ): raise BadRequest( - detail=f"Response format {params.response_format} is not supported, please use one of the supported response_formats: {','.join(CONFIG.enabled_response_formats)}" + detail=f"Response format {params.response_format} is not supported, please use one of the supported response_formats: {','.join(CONFIG.get_enabled_response_formats())}" ) # page_limit diff --git a/optimade/server/routers/info.py b/optimade/server/routers/info.py index aa43d603b..255a1cf82 100644 --- a/optimade/server/routers/info.py +++ b/optimade/server/routers/info.py @@ -25,6 +25,9 @@ def get_info(request: Request) -> InfoResponse: from optimade.models import BaseInfoResource, BaseInfoAttributes + entry_types_by_format_dict = {} + for _ in CONFIG.get_enabled_response_formats(): + entry_types_by_format_dict[_] = list(ENTRY_INFO_SCHEMAS) return InfoResponse( meta=meta_values( request.url, 1, 1, more_data_available=False, schema=CONFIG.schema_url @@ -40,9 +43,9 @@ def get_info(request: Request) -> InfoResponse: "version": __api_version__, } ], - formats=CONFIG.enabled_response_formats, - available_endpoints=["info", "links"] + list(ENTRY_INFO_SCHEMAS.keys()), - entry_types_by_format={"json": list(ENTRY_INFO_SCHEMAS.keys())}, + formats=CONFIG.get_enabled_response_formats(), + available_endpoints=["info", "links"] + list(ENTRY_INFO_SCHEMAS), + entry_types_by_format=entry_types_by_format_dict, is_index=False, ), ), @@ -72,8 +75,8 @@ def get_entry_info(request: Request, entry: str) -> EntryInfoResponse: schema, queryable_properties, entry_type=entry ) output_fields_by_format = {} - for outputformat in CONFIG.enabled_response_formats: - output_fields_by_format[outputformat] = list(properties.keys()) + for outputformat in CONFIG.get_enabled_response_formats(): + output_fields_by_format[outputformat] = list(properties) return EntryInfoResponse( meta=meta_values( diff --git a/optimade/server/routers/utils.py b/optimade/server/routers/utils.py index d4c4b51ae..72d59e9d4 100644 --- a/optimade/server/routers/utils.py +++ b/optimade/server/routers/utils.py @@ -286,6 +286,10 @@ def get_entries( media_type="application/x-hdf5", headers={"Content-Disposition": "attachment"}, ) + else: + raise BadRequest( + detail=f"The response_format {params.response_format} is not supported by this server. Use one of the supported formats: {','.join(CONFIG.get_enabled_response_formats())} instead " + ) def get_single_entry( @@ -342,3 +346,7 @@ def get_single_entry( media_type="application/x-hdf5", headers={"Content-Disposition": "attachment"}, ) + else: + raise BadRequest( + detail=f"The response_format {params.response_format} is not supported by this server. Use one of the supported formats: {','.join(CONFIG.get_enabled_response_formats())} instead " + ) diff --git a/setup.py b/setup.py index 8f97c176c..b9d9d820a 100644 --- a/setup.py +++ b/setup.py @@ -19,10 +19,15 @@ # Server minded elastic_deps = ["elasticsearch-dsl~=7.4,<8.0"] mongo_deps = ["pymongo>=3.12.1,<5", "mongomock~=4.1"] -server_deps = [ - "uvicorn~=0.18", - "pyyaml>=5.4,<7", # Keep at pyyaml 5.4 for aiida-core support -] + mongo_deps +hdf5_deps = ["h5py"] +server_deps = ( + [ + "uvicorn~=0.18", + "pyyaml>=5.4,<7", # Keep at pyyaml 5.4 for aiida-core support + ] + + mongo_deps + + hdf5_deps +) # Client minded @@ -107,7 +112,6 @@ "email_validator~=1.2", "requests~=2.28", "numpy~=1.23", - "h5py~=3.7", ], extras_require={ "all": all_deps, diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index 2c01fdf28..4a7e55dd8 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -66,7 +66,7 @@ def test_convert_to_hdf5_and_back(): def test_unsupported_response_format(check_error_response): request = '/structures?filter=chemical_formula_descriptive="Ac"&response_format=png' - error_detail = f"Response format png is not supported, please use one of the supported response_formats: {','.join(CONFIG.enabled_response_formats)}" + error_detail = f"Response format png is not supported, please use one of the supported response_formats: {','.join(CONFIG.get_enabled_response_formats())}" check_error_response( request, expected_status=400, From 50cacf0f9da1106a0e14949032673b06ef6d89ad Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Tue, 9 Aug 2022 13:48:19 +0200 Subject: [PATCH 18/25] Fixed test_response_format.py --- tests/server/query_params/test_response_format.py | 4 ++-- tests/test_config.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index 4a7e55dd8..cb8f9c9b0 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -18,14 +18,14 @@ def test_response_format(check_response): expected_ids = ["mpf_1"] check_response(request, expected_ids) - if "hdf5" in CONFIG.enabled_response_formats: + if "hdf5" in CONFIG.get_enabled_response_formats(): request = ( '/structures?filter=chemical_formula_descriptive="Ac"&response_format=hdf5' ) check_response(request, expected_ids) -if "hdf5" in CONFIG.enabled_response_formats: +if "hdf5" in CONFIG.get_enabled_response_formats(): def test_single_entry(check_response): """For single entry. Default value for `include` is 'references'""" diff --git a/tests/test_config.json b/tests/test_config.json index 59e8c5c29..a74d6f675 100644 --- a/tests/test_config.json +++ b/tests/test_config.json @@ -35,5 +35,5 @@ "chemsys": "nelements" } }, - "enabled_response_formats": ["json","hdf5"] + "enabled_response_formats": ["json", "hdf5"] } From 82f2b3118f5cdc5f9b88ac7387d13b09b2d8c1ff Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Tue, 9 Aug 2022 18:38:18 +0200 Subject: [PATCH 19/25] Added extra test values, and added support for handling nested lists of strings. --- optimade/adapters/hdf5.py | 19 +++++++++++++------ optimade/models/jsonapi.py | 9 ++++++++- .../query_params/test_response_format.py | 6 ++++++ 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/optimade/adapters/hdf5.py b/optimade/adapters/hdf5.py index 2943b6806..0d4e61b13 100644 --- a/optimade/adapters/hdf5.py +++ b/optimade/adapters/hdf5.py @@ -18,6 +18,8 @@ Unfortunately, h5py does not support storing objects with the numpy.object type. It is therefore not possible to directly store a list of dictionaries in a hdf5 file with h5py. As a workaround, the index of a value in a list is used as a dictionary key so a list can be stored as a dictionary if neccesary. + +It also assumes that all the elements of a list, tuple or numpy array are of the same type. """ @@ -79,7 +81,7 @@ def store_hdf5_dict( hdf5_file[group + "/" + key] = [] continue val_type = type(value[0]) - if val_type == dict: + if isinstance(value[0], dict): hdf5_file.create_group(group + "/" + key) store_hdf5_dict(hdf5_file, value, group + "/" + key) elif val_type.__module__ == np.__name__: @@ -93,18 +95,23 @@ def store_hdf5_dict( elif isinstance(value[0], (int, float)): hdf5_file[group + "/" + key] = np.asarray(value) elif isinstance(value[0], str): - hdf5_file[group + "/" + key] = value + hdf5_file[ + group + "/" + key + ] = value # here I can pass a list of strings to hdf5 which is stored as a numpy object. elif isinstance(value[0], (list, tuple)): list_type = get_recursive_type(value[0]) - if list_type in (int, float): + if list_type in ( + int, + float, + ): hdf5_file[group + "/" + key] = np.asarray(value) else: hdf5_file.create_group(group + "/" + key) store_hdf5_dict(hdf5_file, value, group + "/" + key) else: - raise ValueError( - f"The list with type :{val_type} cannot be converted to hdf5." - ) + hdf5_file.create_group(group + "/" + key) + store_hdf5_dict(hdf5_file, value, group + "/" + key) + elif isinstance(value, dict): hdf5_file.create_group(group + "/" + key) store_hdf5_dict(hdf5_file, value, group + "/" + key) diff --git a/optimade/models/jsonapi.py b/optimade/models/jsonapi.py index 9013d8be3..0dcd48173 100644 --- a/optimade/models/jsonapi.py +++ b/optimade/models/jsonapi.py @@ -320,6 +320,13 @@ class Resource(BaseResource): ) +def process_ndarray(arg): + if arg.dtype == object: + return arg.astype(str).tolist() + else: + return arg.tolist() + + class Response(BaseModel): """A top-level response""" @@ -371,5 +378,5 @@ class Config: numpy.int64: lambda v: int(v), numpy.float64: lambda v: float(v), numpy.bool_: lambda v: bool(v), - numpy.ndarray: lambda v: v.tolist(), + numpy.ndarray: process_ndarray, } diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index cb8f9c9b0..44f57ea13 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -40,6 +40,12 @@ def test_convert_to_hdf5_and_back(): "string": "str", "datetime": datetime.now(), "list": [[[2.3, 6.3], [8.6, 4.5]], [[8.9, 9.4], [5.6, 3.5]]], + "list_of_str": [ + ["string 1", "string 2"], + ["another string"], + "less nested string", + ], + "None_list": [None, None], "dict": {"a key": "a value", "another key": 7.33}, "tuple": (95, 63), "bool": False, From 42864cbb714321f48b33c827096a2f8bf766c8a0 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Wed, 10 Aug 2022 18:47:22 +0200 Subject: [PATCH 20/25] Added extra test to check if response_format is in the enabled_response_formats. --- optimade/server/routers/utils.py | 34 +++++++++++++++++--------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/optimade/server/routers/utils.py b/optimade/server/routers/utils.py index 72d59e9d4..9ca319832 100644 --- a/optimade/server/routers/utils.py +++ b/optimade/server/routers/utils.py @@ -278,14 +278,15 @@ def get_entries( ), included=included, ) - if params.response_format == "json": - return response_object - elif params.response_format == "hdf5": - return Response( - content=generate_hdf5_file_content(response_object), - media_type="application/x-hdf5", - headers={"Content-Disposition": "attachment"}, - ) + if params.response_format in CONFIG.get_enabled_response_formats(): + if params.response_format == "json": + return response_object + elif params.response_format == "hdf5": + return Response( + content=generate_hdf5_file_content(response_object), + media_type="application/x-hdf5", + headers={"Content-Disposition": "attachment"}, + ) else: raise BadRequest( detail=f"The response_format {params.response_format} is not supported by this server. Use one of the supported formats: {','.join(CONFIG.get_enabled_response_formats())} instead " @@ -338,14 +339,15 @@ def get_single_entry( ), included=included, ) - if params.response_format == "json": - return response_object - elif params.response_format == "hdf5": - return Response( - content=generate_hdf5_file_content(response_object), - media_type="application/x-hdf5", - headers={"Content-Disposition": "attachment"}, - ) + if params.response_format in CONFIG.get_enabled_response_formats(): + if params.response_format == "json": + return response_object + elif params.response_format == "hdf5": + return Response( + content=generate_hdf5_file_content(response_object), + media_type="application/x-hdf5", + headers={"Content-Disposition": "attachment"}, + ) else: raise BadRequest( detail=f"The response_format {params.response_format} is not supported by this server. Use one of the supported formats: {','.join(CONFIG.get_enabled_response_formats())} instead " From 30af05ac879c63e57ea30a3bf9e9cf4df3125e21 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Mon, 15 Aug 2022 15:53:32 +0200 Subject: [PATCH 21/25] Added filenames to the header. --- optimade/server/routers/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/optimade/server/routers/utils.py b/optimade/server/routers/utils.py index 9ca319832..e7e8a7d37 100644 --- a/optimade/server/routers/utils.py +++ b/optimade/server/routers/utils.py @@ -285,7 +285,9 @@ def get_entries( return Response( content=generate_hdf5_file_content(response_object), media_type="application/x-hdf5", - headers={"Content-Disposition": "attachment"}, + headers={ + "Content-disposition": f"attachment; filename={collection.collection.name}.hdf5" + }, ) else: raise BadRequest( @@ -346,7 +348,9 @@ def get_single_entry( return Response( content=generate_hdf5_file_content(response_object), media_type="application/x-hdf5", - headers={"Content-Disposition": "attachment"}, + headers={ + "Content-disposition": f"attachment; filename={entry_id}.hdf5" + }, ) else: raise BadRequest( From 47fa9add9bd8cf96512a71876862679e638e17a3 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Tue, 16 Aug 2022 15:23:29 +0200 Subject: [PATCH 22/25] Changed the way the collection name is determined for the file name of the hdf5 file. --- optimade/server/routers/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimade/server/routers/utils.py b/optimade/server/routers/utils.py index e7e8a7d37..68a5a8cd1 100644 --- a/optimade/server/routers/utils.py +++ b/optimade/server/routers/utils.py @@ -286,7 +286,7 @@ def get_entries( content=generate_hdf5_file_content(response_object), media_type="application/x-hdf5", headers={ - "Content-disposition": f"attachment; filename={collection.collection.name}.hdf5" + "Content-disposition": f"attachment; filename={results[0]['type']}.hdf5" }, ) else: From 4ada2849d4469d7ac426d96932b95519063fe3c6 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Thu, 15 Sep 2022 17:32:54 +0200 Subject: [PATCH 23/25] Update requirements.txt put requirements in alphabetical order --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 465f15f47..8edf03cfb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ elasticsearch-dsl==7.4.0 email_validator==1.2.1 +fastapi==0.82.0 h5py==3.7.0 lark==1.1.2 mongomock==4.1.2 numpy==1.23.0 -fastapi==0.82.0 pydantic==1.10.2 pymongo==4.2.0 pyyaml==5.4 From f1c309da693d24a66a91b85858031c11677f58d1 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Sun, 18 Sep 2022 20:56:53 +0200 Subject: [PATCH 24/25] updated version requirement numpy in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8edf03cfb..3e824b4aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ fastapi==0.82.0 h5py==3.7.0 lark==1.1.2 mongomock==4.1.2 -numpy==1.23.0 +numpy==1.23.2 pydantic==1.10.2 pymongo==4.2.0 pyyaml==5.4 From b32278f818beae8bdcde28b7338d571d424667b3 Mon Sep 17 00:00:00 2001 From: Johan Bergsma <29785380+JPBergsma@users.noreply.github.com> Date: Wed, 21 Sep 2022 18:02:21 +0200 Subject: [PATCH 25/25] Small fields are now stored as attributes rather than datasets. --- optimade/adapters/hdf5.py | 153 +++++++++++------- .../query_params/test_response_format.py | 6 + 2 files changed, 98 insertions(+), 61 deletions(-) diff --git a/optimade/adapters/hdf5.py b/optimade/adapters/hdf5.py index 0d4e61b13..91c8a9d2f 100644 --- a/optimade/adapters/hdf5.py +++ b/optimade/adapters/hdf5.py @@ -4,6 +4,7 @@ from datetime import datetime, timezone from optimade.models import EntryResponseMany, EntryResponseOne import h5py +from sys import getsizeof import numpy as np @@ -48,7 +49,7 @@ def generate_hdf5_file_content( def store_hdf5_dict( - hdf5_file: h5py._hl.files.File, iterable: Union[dict, list, tuple], group: str = "" + hdf5_file: h5py._hl.files.File, iterable: Union[dict, list, tuple], group: str = "/" ): """This function stores a python list, dictionary or tuple in a hdf5 file. the currently supported datatypes are str, int, float, list, dict, tuple, bool, AnyUrl, @@ -78,7 +79,7 @@ def store_hdf5_dict( value, (list, tuple) ): # For now, I assume that all values in the list have the same type. if len(value) < 1: # case empty list - hdf5_file[group + "/" + key] = [] + store_value_in_hdf5(key, value, group, hdf5_file) continue val_type = type(value[0]) if isinstance(value[0], dict): @@ -86,25 +87,21 @@ def store_hdf5_dict( store_hdf5_dict(hdf5_file, value, group + "/" + key) elif val_type.__module__ == np.__name__: try: - hdf5_file[group + "/" + key] = value - except (TypeError) as hdf5_error: + store_value_in_hdf5(key, value, group, hdf5_file) + except TypeError as hdf5_error: raise TypeError( "Unfortunatly more complex numpy types like object can not yet be stored in hdf5. Error from hdf5:" + hdf5_error ) elif isinstance(value[0], (int, float)): - hdf5_file[group + "/" + key] = np.asarray(value) + store_value_in_hdf5(key, np.asarray(value), group, hdf5_file) elif isinstance(value[0], str): - hdf5_file[ - group + "/" + key - ] = value # here I can pass a list of strings to hdf5 which is stored as a numpy object. + # Here I can pass a list of strings to hdf5 which is stored as a numpy object. + store_value_in_hdf5(key, value, group, hdf5_file) elif isinstance(value[0], (list, tuple)): list_type = get_recursive_type(value[0]) - if list_type in ( - int, - float, - ): - hdf5_file[group + "/" + key] = np.asarray(value) + if list_type in (int, float): + store_value_in_hdf5(key, np.asarray(value), group, hdf5_file) else: hdf5_file.create_group(group + "/" + key) store_hdf5_dict(hdf5_file, value, group + "/" + key) @@ -116,40 +113,55 @@ def store_hdf5_dict( hdf5_file.create_group(group + "/" + key) store_hdf5_dict(hdf5_file, value, group + "/" + key) elif isinstance(value, bool): - hdf5_file[group + "/" + key] = np.bool_(value) - elif isinstance( - value, AnyUrl - ): # This case had to be placed above the str case as AnyUrl inherits from the string class, but cannot be handled directly by h5py. - hdf5_file[group + "/" + key] = str(value) - elif isinstance( - value, - ( - int, - float, - str, - ), - ): - hdf5_file[group + "/" + key] = value + store_value_in_hdf5(key, np.bool_(value), group, hdf5_file) + elif isinstance(value, AnyUrl): + # This case had to be placed above the str case as AnyUrl inherits from the string class, but cannot be handled directly by h5py. + store_value_in_hdf5(key, str(value), group, hdf5_file) + elif isinstance(value, (int, float, str)): + store_value_in_hdf5(key, value, group, hdf5_file) + elif type(value).__module__ == np.__name__: try: - hdf5_file[group + "/" + key] = value - except (TypeError) as hdf5_error: + store_value_in_hdf5(key, value, group, hdf5_file) + except TypeError as hdf5_error: raise TypeError( - "Unfortunatly more complex numpy types like object can not yet be stored in hdf5. Error from hdf5:" - + hdf5_error + f"Unfortunatly more complex numpy types like object can not yet be stored in hdf5. Error from hdf5:{hdf5_error}" ) elif isinstance(value, datetime): - hdf5_file[group + "/" + key] = value.astimezone(timezone.utc).strftime( - "%Y-%m-%dT%H:%M:%SZ" + store_value_in_hdf5( + key, + value.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + group, + hdf5_file, ) elif value is None: - hdf5_file[group + "/" + key] = h5py.Empty("f") + store_value_in_hdf5(key, h5py.Empty("f"), group, hdf5_file) else: raise ValueError( f"Unable to store a value of type: {type(value)} in hdf5 format." ) +def store_value_in_hdf5(key, value, group, hdf5_file): + compression_level = 1 + if ( + getsizeof(value) < 4096 + ): # small properties can be sored as attributes the value of 4096 is rather arbitrary. The total of all the properties should however not exceed 64 kb. + if ( + group + ): # if a group is already present we can store small properties as attributes. (It seems that for each group /dataset a 64kb header is made causing the files to become very large.) + hdf5_file[group].attrs[key] = value + else: + hdf5_file[group + "/" + key] = value + else: + hdf5_file.create_dataset( + group + "/" + key, + data=value, + compression="gzip", + compression_opts=compression_level, + ) + + def get_recursive_type(obj: Any) -> type: """If obj is a list or tuple this function returns the type of the first object in the list/tuple that is not a list or tuple. If the list or tuple is empty it returns None. @@ -206,32 +218,51 @@ def generate_dict_from_hdf5( return_value = None for key, value in hdf5_file[group].items(): - if key.isdigit(): - if return_value is None: - return_value = [] - if isinstance(value, h5py._hl.group.Group): - return_value.append( - generate_dict_from_hdf5(hdf5_file, group=group + key + "/") - ) - elif isinstance(value[()], h5py._hl.base.Empty): - return_value.append(None) - elif isinstance(value[()], bytes): - return_value.append(value[()].decode()) - else: - return_value.append(value[()]) - - else: # Case dictionary - if return_value is None: - return_value = {} - if isinstance(value, h5py._hl.group.Group): - return_value[key] = generate_dict_from_hdf5( - hdf5_file, group=group + key + "/" - ) - elif isinstance(value[()], h5py._hl.base.Empty): - return_value[key] = None - elif isinstance(value[()], bytes): - return_value[key] = value[()].decode() - else: - return_value[key] = value[()] + return_value = inside_generate_dict_from_hdf5( + key, value, return_value, group, hdf5_file + ) + for key, value in hdf5_file[group].attrs.items(): + return_value = inside_generate_dict_from_hdf5( + key, value, return_value, group, hdf5_file + ) + return return_value + + +def inside_generate_dict_from_hdf5(key, value, return_value, group, hdf5_file): + if key.isdigit(): + if return_value is None: + return_value = [] + if isinstance(value, h5py._hl.group.Group): + return_value.append( + generate_dict_from_hdf5(hdf5_file, group=group + key + "/") + ) + elif isinstance(value, h5py._hl.base.Empty): + return_value.append(None) + elif isinstance(value, str): + return_value.append(value) + elif isinstance(value[()], h5py._hl.base.Empty): + return_value.append(None) + elif isinstance(value[()], bytes): + return_value.append(value[()].decode()) + else: + return_value.append(value[()]) + + else: # Case dictionary + if return_value is None: + return_value = {} + if isinstance(value, h5py._hl.group.Group): + return_value[key] = generate_dict_from_hdf5( + hdf5_file, group=group + key + "/" + ) + elif isinstance(value, h5py._hl.base.Empty): + return_value[key] = None + elif isinstance(value, str): + return_value[key] = value + elif isinstance(value[()], h5py._hl.base.Empty): + return_value[key] = None + elif isinstance(value[()], bytes): + return_value[key] = value[()].decode() + else: + return_value[key] = value[()] return return_value diff --git a/tests/server/query_params/test_response_format.py b/tests/server/query_params/test_response_format.py index 44f57ea13..a11cea5d8 100644 --- a/tests/server/query_params/test_response_format.py +++ b/tests/server/query_params/test_response_format.py @@ -56,6 +56,12 @@ def test_convert_to_hdf5_and_back(): "numpy_float32": numpy.float32(0.88153), "numpy_bool": numpy.bool_(True), "numpy_array": numpy.array([(1, 2), (3, 4)]), + "list_of_numpy_int": [numpy.int64(42), numpy.int64(16), numpy.int64(23)], + "list_of_numpy_array": [ + numpy.array([(1, 2), (3, 4)]), + numpy.array([(1.2, 2.3), (3.5, 4.1)]), + numpy.array([(1.8, 2.0), (3, 4)]), + ], } hdf5_file_content = generate_hdf5_file_content(test_dict)