Add hdf5 response format #1292

Open
wants to merge 31 commits into base: main
Changes from 13 of 31 commits
e2d8010
Added support for returning optimade data in the hdf5 format.
JPBergsma Jul 28, 2022
079bd71
Added extra doctstrings to hdf5.py and made setting for enabling/disa…
JPBergsma Jul 29, 2022
0b71e9e
Added dependancies for hdf5 response to requirements.txt and setup.py.
JPBergsma Jul 29, 2022
9167351
Added enabled_response_formats to test config and disabled hdf5 tests…
JPBergsma Jul 29, 2022
7551132
Added enabled_response_formats to test config and disabled hdf5 tests…
JPBergsma Jul 29, 2022
e43297e
Merge branch 'master' into JPBergsma/add_HDF5_output_format
JPBergsma Jul 29, 2022
d811457
merges changes from master.
JPBergsma Jul 29, 2022
7952092
checking whether the not installing of numpy on github server was cau…
JPBergsma Jul 29, 2022
694894f
added hdf5_deps to extras_require.
JPBergsma Jul 29, 2022
8d51f55
Added numpy and h5py to install_requirements in setup.py
JPBergsma Jul 29, 2022
12b79e0
Use a query that does not have an _exampl_ field to test response for…
JPBergsma Jul 29, 2022
9fe4dcc
Added extra test and the supported response formats are now listed at…
JPBergsma Aug 3, 2022
1981032
Made some changes to the docstrings and type definitions so it will h…
JPBergsma Aug 4, 2022
79b48d6
The test for the single entry point did not work. This is fixed now
JPBergsma Aug 4, 2022
687ea78
Added more thorough check to see whetehr the response contnet type is…
JPBergsma Aug 4, 2022
fbfe0f7
Remove numpy and h5py from 'install_requires'.
JPBergsma Aug 4, 2022
a55bd82
Revert "Remove numpy and h5py from 'install_requires'."
JPBergsma Aug 4, 2022
43e326f
Remove h5py_deps and put numpy and h5py back in install_requires.
JPBergsma Aug 4, 2022
1e7e3f9
Processed comments from code review.
JPBergsma Aug 9, 2022
50cacf0
Fixed test_response_format.py
JPBergsma Aug 9, 2022
82f2b31
Added extra test values, and added support for handling nested lists …
JPBergsma Aug 9, 2022
15770f9
Merge branch 'master' into JPBergsma/add_HDF5_output_format
JPBergsma Aug 9, 2022
42864cb
Added extra test to check if response_format is in the enabled_respon…
JPBergsma Aug 10, 2022
7c6a562
Merge branch 'JPBergsma/add_HDF5_output_format' of https://github.com…
JPBergsma Aug 10, 2022
30af05a
Added filenames to the header.
JPBergsma Aug 15, 2022
47fa9ad
Changed the way the collection name is determined for the file name o…
JPBergsma Aug 16, 2022
9ef6b05
Merge branch 'master' into JPBergsma/add_HDF5_output_format
JPBergsma Sep 15, 2022
4ada284
Update requirements.txt
JPBergsma Sep 15, 2022
f1c309d
updated version requirement numpy in requirements.txt
JPBergsma Sep 18, 2022
b32278f
Small fields are now stored as attributes rather than datasets.
JPBergsma Sep 21, 2022
9597cca
Merge branch 'master' into JPBergsma/add_HDF5_output_format
JPBergsma Sep 21, 2022
3 changes: 3 additions & 0 deletions docs/api_reference/adapters/hdf5.md
@@ -0,0 +1,3 @@
# hdf5

::: optimade.adapters.hdf5
230 changes: 230 additions & 0 deletions optimade/adapters/hdf5.py
@@ -0,0 +1,230 @@
from io import BytesIO
from typing import Union, Any
from pydantic import AnyUrl
from datetime import datetime, timezone
from optimade.models import EntryResponseMany, EntryResponseOne
import h5py
import numpy as np


"""This adaptor class can be used to generate a hdf5 response instead of a json response and to convert the hdf5 response back into an python dictionary.
It can handle numeric data in a binary format compatible with numpy.
It is therefore more efficient than the JSON format at returning large amounts of numeric data.
It however also has more overhead resulting in a larger response for entries with little numeric data.
To enable support for your server the parameter "enabled_response_formats" can be specified in the config file.
It is a list of the supported response_formats. To support the hdf5 return format it should be set to: ["json", "hdf5"]
(support for the JSON format is mandatory)

Unfortunately, h5py does not support storing objects with the numpy.object type.
It is therefore not possible to directly store a list of dictionaries in a hdf5 file with h5py.
As a workaround, the index of a value in a list is used as a dictionary key so a list can be stored as a dictionary if neccesary.
"""


def generate_hdf5_file_content(
response_object: Union[EntryResponseMany, EntryResponseOne, dict, list, tuple]
) -> bytes:
"""This function generates the content of a hdf5 file from an EntryResponse object.
It should also be able to handle python dictionaries lists and tuples.

Parameters:
response_object: an OPTIMADE response object. This can be of any OPTIMADE entry type, such as structure, reference etc.

Returns:
A binary object containing the contents of the hdf5 file.
"""

temp_file = BytesIO()
hdf5_file = h5py.File(temp_file, "w")
if isinstance(response_object, (EntryResponseMany, EntryResponseOne)):
response_object = response_object.dict(exclude_unset=True)
store_hdf5_dict(hdf5_file, response_object)
hdf5_file.close()
file_content = temp_file.getvalue()
temp_file.close()
return file_content


def store_hdf5_dict(
hdf5_file: h5py._hl.files.File, iterable: Union[dict, list, tuple], group: str = ""
):
"""This function stores a python list, dictionary or tuple in a hdf5 file.
the currently supported datatypes are str, int, float, list, dict, tuple, bool, AnyUrl,
None ,datetime or any numpy type or numpy array.

Unfortunately, h5py does not support storing objects with the numpy.object type.
It is therefore not possible to directly store a list of dictionaries in a hdf5 file with h5py.
As a workaround, the index of a value in a list is used as a dictionary key so a list can be stored as a dictionary if neccesary.

Parameters:
hdf5_file: An hdf5 file like object.
iterable: The object to be stored in the hdf5 file.
group: This indicates to group in the hdf5 file the list, tuple or dictionary should be added.

Raises:
TypeError: If this function encounters an object with a type that it cannot convert to the hdf5 format
a ValueError is raised.
"""
if isinstance(iterable, (list, tuple)):
iterable = enumerate(iterable)
elif isinstance(iterable, dict):
iterable = iterable.items()
for x in iterable:
key = str(x[0])
value = x[1]
if isinstance(
value, (list, tuple)
): # For now, I assume that all values in the list have the same type.
if len(value) < 1: # case empty list
hdf5_file[group + "/" + key] = []
continue
val_type = type(value[0])
if val_type == dict:
hdf5_file.create_group(group + "/" + key)
store_hdf5_dict(hdf5_file, value, group + "/" + key)
elif val_type.__module__ == np.__name__:
try:
hdf5_file[group + "/" + key] = value
except TypeError as hdf5_error:
raise TypeError(
"Unfortunately, more complex numpy types, such as object, cannot yet be stored in hdf5."
) from hdf5_error
elif isinstance(value[0], (int, float)):
hdf5_file[group + "/" + key] = np.asarray(value)
elif isinstance(value[0], str):
hdf5_file[group + "/" + key] = value
elif isinstance(value[0], (list, tuple)):
list_type = get_recursive_type(value[0])
if list_type in (int, float):
hdf5_file[group + "/" + key] = np.asarray(value)
else:
hdf5_file.create_group(group + "/" + key)
store_hdf5_dict(hdf5_file, value, group + "/" + key)
else:
raise ValueError(
f"The list with type {val_type} cannot be converted to hdf5."
)
elif isinstance(value, dict):
hdf5_file.create_group(group + "/" + key)
store_hdf5_dict(hdf5_file, value, group + "/" + key)
elif isinstance(value, bool):
hdf5_file[group + "/" + key] = np.bool_(value)
elif isinstance(
value, AnyUrl
): # This case had to be placed above the str case as AnyUrl inherits from the string class, but cannot be handled directly by h5py.
hdf5_file[group + "/" + key] = str(value)
elif isinstance(
value,
(
int,
float,
str,
),
):
hdf5_file[group + "/" + key] = value
elif type(value).__module__ == np.__name__:
try:
hdf5_file[group + "/" + key] = value
except TypeError as hdf5_error:
raise TypeError(
"Unfortunately, more complex numpy types, such as object, cannot yet be stored in hdf5."
) from hdf5_error
elif isinstance(value, datetime):
hdf5_file[group + "/" + key] = value.astimezone(timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%SZ"
)
elif value is None:
hdf5_file[group + "/" + key] = h5py.Empty("f")
else:
raise ValueError(
f"Unable to store a value of type: {type(value)} in hdf5 format."
)


def get_recursive_type(obj: Any) -> type:
"""If obj is a list or tuple this function returns the type of the first object in the list/tuple that is not a list
or tuple. If the list or tuple is empty it returns None.
Finally if the object is not a list or tuple it returns the type of the object.

Parameters:
obj: any python object

Returns:
The type of the objects that the object contains or the type of the object itself when it does not contain other objects."""

if isinstance(obj, (list, tuple)):
if len(obj) == 0:
return None
else:
if isinstance(obj[0], (list, tuple)):
return get_recursive_type(obj[0])
else:
return type(obj[0])
return type(obj)


def generate_response_from_hdf5(hdf5_content: bytes) -> dict:
"""Generates a response_dict from a HDF5 file like object.
It is similar to the response_dict generated from the JSON response, except that the numerical data will have numpy
types.

Parameters:
hdf5_content: the content of a hdf5 file.

Returns:
A dictionary containing the data of the hdf5 file."""

temp_file = BytesIO(hdf5_content)
hdf5_file = h5py.File(temp_file, "r")
response_dict = generate_dict_from_hdf5(hdf5_file)
return response_dict


def generate_dict_from_hdf5(
hdf5_file: h5py._hl.files.File, group: str = "/"
) -> Union[dict, list]:
"""This function returns the content of a hdf5 group.
Because of the workaround described under the store_hdf5_dict function, groups which have numbers as keys will be turned to lists(No guartee that the order is the same as in th eoriginal list).
Otherwise, the group will be turned into a dict.

Parameters:
hdf5_file: An HDF5_object containing the data that should be converted to a dictionary or list.
group: The hdf5 group for which the dictionary should be created. The default is "/" which will return all the data in the hdf5_object

Returns:
A dict or list containing the content of the hdf5 group.
"""

return_value = None
for key, value in hdf5_file[group].items():
if key.isdigit():
if return_value is None:
return_value = []
if isinstance(value, h5py._hl.group.Group):
return_value.append(
generate_dict_from_hdf5(hdf5_file, group=group + key + "/")
)
elif isinstance(value[()], h5py._hl.base.Empty):
return_value.append(None)
elif isinstance(value[()], bytes):
return_value.append(value[()].decode())
else:
return_value.append(value[()])

else: # Case dictionary
if return_value is None:
return_value = {}
if isinstance(value, h5py._hl.group.Group):
return_value[key] = generate_dict_from_hdf5(
hdf5_file, group=group + key + "/"
)
elif isinstance(value[()], h5py._hl.base.Empty):
return_value[key] = None
elif isinstance(value[()], bytes):
return_value[key] = value[()].decode()
else:
return_value[key] = value[()]

return return_value
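The index-as-key workaround described in the docstrings above can be sketched without h5py. The helper names here are hypothetical, not part of the module, and the numeric sort is one way to restore an ordering that the current implementation does not guarantee:

```python
# Illustrative helpers for the workaround that store_hdf5_dict and
# generate_dict_from_hdf5 apply internally: HDF5 has no list type, so list
# indices become string keys on the way in, and digit-only keys mark a
# group as "really a list" on the way out.

def list_to_indexed_dict(values):
    """Store a list as a dict keyed by the stringified index."""
    return {str(i): v for i, v in enumerate(values)}

def indexed_dict_to_list(mapping):
    """Recover the list; sort numerically, since iterating an HDF5 group
    is alphabetical and "10" would otherwise sort before "2"."""
    return [mapping[key] for key in sorted(mapping, key=int)]

species = [{"name": "Si"}, {"name": "O"}]
assert indexed_dict_to_list(list_to_indexed_dict(species)) == species
```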
7 changes: 7 additions & 0 deletions optimade/models/jsonapi.py
@@ -8,6 +8,7 @@
parse_obj_as,
root_validator,
)
import numpy
from optimade.models.utils import StrictField


@@ -365,4 +366,10 @@ class Config:
datetime: lambda v: v.astimezone(timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%SZ"
),
numpy.int32: lambda v: int(v),
numpy.float32: lambda v: float(v),
numpy.int64: lambda v: int(v),
numpy.float64: lambda v: float(v),
numpy.bool_: lambda v: bool(v),
numpy.ndarray: lambda v: v.tolist(),
Member:
This seems to introduce a mandatory dependency on numpy. I would suggest that the HDF5Response is in a separate module and inherits from the JSON:API one. In the best case, it will just contain this additional config, but it may also make it easier to modify where necessary.

Contributor Author:
This is not directly related to the hdf5 format, so it would be strange to place it in an HDF5Response.
I want to be able to handle NumPy numbers internally, so the format of the numbers does not need to change when they are read from a file.

I can make it so that these encoders are only loaded when NumPy is present. However, I am not sure how we should indicate optional dependencies in setup.py or requirements.txt.

}
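A conditional registration along these lines would keep numpy optional, as the reviewer requests. This is a sketch, not the merged implementation; the encoder map would be merged into the pydantic Config:

```python
# Hypothetical sketch: register the numpy encoders only when numpy is
# importable, so plain-JSON deployments need not install it.
json_encoders = {}
try:
    import numpy

    json_encoders.update(
        {
            numpy.int32: int,
            numpy.int64: int,
            numpy.float32: float,
            numpy.float64: float,
            numpy.bool_: bool,
            numpy.ndarray: lambda v: v.tolist(),
        }
    )
except ImportError:
    pass  # numpy absent: JSON types only, HDF5 responses stay disabled
```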
4 changes: 4 additions & 0 deletions optimade/server/config.py
@@ -280,6 +280,10 @@ class ServerConfig(BaseSettings):
True,
description="If True, the server will check whether the query parameters given in the request are correct.",
)
enabled_response_formats: Optional[List[str]] = Field(
Member:
Should make an enum of supported formats, then do Optional[List[SupportedFormats]] like some of the other options

Contributor Author:
Ok, I am trying to do this, but it does make things more complicated because I now have to convert the enums to a string before I can do the comparisons in my code. It would be easier to use a Literal["json", "hdf5"] instead.

Contributor Author:
I am now using an Enum class to restrict which values can be specified for enabled_response_formats.

["json"],
description="""A list of the response formats that are supported by this server. Must include the "json" format.""",
)

@validator("implementation", pre=True)
def set_implementation_version(cls, v):
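A str-mixin enum would satisfy the reviewer's suggestion while avoiding the string conversions the author mentions, since members compare equal to plain strings. The class name here is hypothetical, not the one in the PR:

```python
from enum import Enum

class SupportedResponseFormats(str, Enum):
    """Hypothetical enum of response formats. Inheriting from str makes
    members compare equal to plain strings, so checks such as
    params.response_format in CONFIG.enabled_response_formats keep working."""
    JSON = "json"
    HDF5 = "hdf5"

enabled = [SupportedResponseFormats.JSON, SupportedResponseFormats.HDF5]
assert "hdf5" in enabled  # no str() conversion needed
assert SupportedResponseFormats.JSON == "json"
```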
4 changes: 2 additions & 2 deletions optimade/server/entry_collections/entry_collections.py
@@ -301,10 +301,10 @@ def handle_query_params(
# response_format
if (
getattr(params, "response_format", False)
and params.response_format != "json"
and params.response_format not in CONFIG.enabled_response_formats
):
raise BadRequest(
detail=f"Response format {params.response_format} is not supported, please use response_format='json'"
detail=f"Response format {params.response_format} is not supported, please use one of the supported response_formats: {','.join(CONFIG.enabled_response_formats)}"
)

# page_limit
3 changes: 2 additions & 1 deletion optimade/server/middleware.py
@@ -447,7 +447,8 @@ async def dispatch(self, request: Request, call_next):
if not isinstance(chunk, bytes):
chunk = chunk.encode(charset)
body += chunk
body = body.decode(charset)
if response.raw_headers[1][1] == b"application/vnd.api+json":
body = body.decode(charset)
Member:
Is this always guaranteed to be at [1][1]? Probably better to check via the header keys.

Contributor Author:
Good point. I have changed the code so that it now loops over all entries in the header.


if self._warnings:
response = json.loads(body)
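The header loop the author describes could look like the following sketch. Starlette stores `raw_headers` as a list of `(name, value)` byte pairs in no guaranteed position, so the content type should be located by key rather than by indexing `raw_headers[1][1]`; the function name is illustrative:

```python
# Find the content type by key instead of assuming its position.
def is_json_response(raw_headers):
    for name, value in raw_headers:
        if name.lower() == b"content-type":
            return value.startswith(b"application/vnd.api+json")
    return False  # no content-type header found

assert is_json_response([(b"content-type", b"application/vnd.api+json")])
assert not is_json_response([(b"content-type", b"application/x-hdf5")])
```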
7 changes: 4 additions & 3 deletions optimade/server/routers/info.py
@@ -40,7 +40,7 @@ def get_info(request: Request) -> InfoResponse:
"version": __api_version__,
}
],
formats=["json"],
formats=CONFIG.enabled_response_formats,
available_endpoints=["info", "links"] + list(ENTRY_INFO_SCHEMAS.keys()),
entry_types_by_format={"json": list(ENTRY_INFO_SCHEMAS.keys())},
is_index=False,
@@ -71,8 +71,9 @@ def get_entry_info(request: Request, entry: str) -> EntryInfoResponse:
properties = retrieve_queryable_properties(
schema, queryable_properties, entry_type=entry
)

output_fields_by_format = {"json": list(properties.keys())}
output_fields_by_format = {}
for outputformat in CONFIG.enabled_response_formats:
output_fields_by_format[outputformat] = list(properties.keys())
Member:
Suggested change
output_fields_by_format[outputformat] = list(properties.keys())
output_fields_by_format[outputformat] = list(properties)

.keys() is unnecessary if you just want a list of all keys (I see we use it above too, could be removed)

Contributor Author:
I have removed the unnecessary .keys() from this file.
It would probably be good to do a regex search for the "list(*.keys()" pattern, so we can remove these in all our code.


return EntryInfoResponse(
meta=meta_values(
23 changes: 20 additions & 3 deletions optimade/server/routers/utils.py
@@ -4,7 +4,7 @@
from datetime import datetime
from typing import Any, Dict, List, Set, Union

from fastapi import Request
from fastapi import Request, Response
from fastapi.responses import JSONResponse
from starlette.datastructures import URL as StarletteURL

@@ -22,6 +22,7 @@
from optimade.server.exceptions import BadRequest, InternalServerError
from optimade.server.query_params import EntryListingQueryParams, SingleEntryQueryParams
from optimade.utils import mongo_id_for_database, get_providers, PROVIDER_LIST_URLS
from optimade.adapters.hdf5 import generate_hdf5_file_content

__all__ = (
"BASE_URL_PREFIXES",
@@ -265,7 +266,7 @@ def get_entries(
if fields or include_fields:
results = handle_response_fields(results, fields, include_fields)

return response(
response_object = response(
links=links,
data=results,
meta=meta_values(
@@ -277,6 +278,14 @@
),
included=included,
)
if params.response_format == "json":
return response_object
elif params.response_format == "hdf5":
Member:
Need to check whether hdf5 is also enabled in the CONFIG.enabled_response_formats too right?

Member:
(I now see that this is done in handle_query_params, but perhaps another guard is needed here so that implementations can pick and choose which bits of the reference server they use.)

Contributor Author:
I have added an extra check.

return Response(
content=generate_hdf5_file_content(response_object),
media_type="application/x-hdf5",
headers={"Content-Disposition": "attachment"},
)


def get_single_entry(
@@ -313,7 +322,7 @@ def get_single_entry(
if fields or include_fields and results is not None:
results = handle_response_fields(results, fields, include_fields)[0]

return response(
response_object = response(
links=links,
data=results,
meta=meta_values(
@@ -325,3 +334,11 @@
),
included=included,
)
if params.response_format == "json":
return response_object
elif params.response_format == "hdf5":
return Response(
content=generate_hdf5_file_content(response_object),
media_type="application/x-hdf5",
headers={"Content-Disposition": "attachment"},
)
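The response-format dispatch used in both get_entries and get_single_entry, including the extra guard discussed in this thread, can be sketched with placeholders. Here `enabled_response_formats` stands in for `CONFIG.enabled_response_formats`, plain tuples stand in for FastAPI response objects, and serialization of the HDF5 payload is elided:

```python
# Sketch of the json/hdf5 dispatch with a local guard, so the helper stays
# safe even when reused outside handle_query_params.
enabled_response_formats = ["json", "hdf5"]

def dispatch_response(response_object, response_format):
    if response_format not in enabled_response_formats:
        raise ValueError(
            f"Response format {response_format} is not supported; "
            f"use one of: {', '.join(enabled_response_formats)}"
        )
    if response_format == "json":
        return ("application/vnd.api+json", response_object)
    # "hdf5": the real code wraps generate_hdf5_file_content(response_object)
    # in a Response with a Content-Disposition: attachment header
    return ("application/x-hdf5", response_object)

assert dispatch_response({}, "hdf5")[0] == "application/x-hdf5"
```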
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,8 +1,10 @@
elasticsearch-dsl==7.4.0
email_validator==1.2.1
fastapi==0.79.0
h5py==3.7.0
lark==1.1.2
mongomock==4.1.2
numpy==1.23.0
pydantic==1.9.1
pymongo==4.1.1
pyyaml==5.4