Skip to content
This repository has been archived by the owner on Apr 27, 2021. It is now read-only.

Commit

Permalink
Release beta 20
Browse files Browse the repository at this point in the history
  • Loading branch information
hannes-ucsc committed Oct 8, 2019
2 parents 37b621f + 1536ce3 commit 5790e16
Show file tree
Hide file tree
Showing 7 changed files with 1,172 additions and 17 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/label.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# GitHub Actions workflow: automatically apply the `orange` label to issues.
name: Add `orange` label to new issues
# Trigger on issue events; the labeler action presumably filters for the
# relevant event types (e.g. `opened`) itself -- TODO confirm.
on: [issues]
jobs:
  label:
    runs-on: ubuntu-latest
    steps:
      # Project-specific fork of the GitHub labeler action, pinned to the
      # `releases/v1` ref.
      - uses: HumanCellAtlas/azul-github-labeler-action@releases/v1
        with:
          # Token granting the action permission to modify issue labels
          repo-token: "${{secrets.GITHUB_TOKEN}}"
          label: orange
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="hca-metadata-api",
version="1.0b20",
license='MIT',
install_requires=[
'dataclasses >= 0.6'
Expand All @@ -11,7 +11,8 @@
extras_require={
"dss": [
'hca == 5.2.0',
'urllib3 >= 1.23',
'requests >= 2.19.1'
],
"examples": [
'jupyter >= 1.0.0'
Expand Down
2 changes: 2 additions & 0 deletions src/humancellatlas/data/metadata/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,13 +588,15 @@ class File(LinkedEntity):
from_processes: MutableMapping[UUID4, Process] = field(repr=False)
to_processes: MutableMapping[UUID4, Process]
manifest_entry: ManifestEntry
content_description: Set[str]

def __init__(self, json: JSON, manifest: Mapping[str, ManifestEntry]):
    """
    Deserialize a file entity from its metadata document.

    :param json: the metadata document for this file; some schema versions
                 nest the actual entity under a 'content' key
    :param manifest: maps each file name to its entry in the bundle manifest
    """
    super().__init__(json)
    # Unwrap the optional 'content' envelope, falling back to the document
    # itself when the key is absent.
    content = json.get('content', json)
    core = content['file_core']
    # `lookup` is a project helper defined elsewhere; presumably it tries the
    # given keys in order ('format' first, then the older 'file_format') to
    # support multiple schema versions -- TODO confirm against its definition.
    self.format = lookup(core, 'format', 'file_format')
    self.manifest_entry = manifest[core['file_name']]
    # 'content_description' is optional; collect the ontology label of each
    # entry into a set.
    self.content_description = {ontology_label(cd) for cd in core.get('content_description', [])}
    # Process links are filled in later, once all entities exist.
    self.from_processes = {}
    self.to_processes = {}

Expand Down
70 changes: 63 additions & 7 deletions src/humancellatlas/data/metadata/helpers/dss.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
import logging
from typing import List, Optional, Tuple
import os
from typing import (
List,
Optional,
Tuple,
Mapping,
Any,
)
from unittest.mock import patch

from hca.dss import DSSClient
from requests import Session
from urllib3 import Timeout

from humancellatlas.data.metadata.api import JSON

logger = logging.getLogger(__name__)


@lru_cache(maxsize=1)
def default_num_workers() -> int:
    """
    Return the default size of the thread pool used for downloading metadata
    files in parallel: a small multiple of the number of CPU cores on the
    system executing this function.

    The result is cached so repeated calls (e.g. when used as a default
    argument) are cheap and consistent.
    """
    # os.cpu_count() may return None when the core count cannot be
    # determined; fall back to a single core so we never raise TypeError.
    return (os.cpu_count() or 1) * 5


def download_bundle_metadata(client: DSSClient,
replica: str,
uuid: str,
version: Optional[str] = None,
directurls: bool = False,
presignedurls: bool = False,
num_workers: Optional[int] = default_num_workers()) -> Tuple[str, List[JSON], JSON]:
"""
Download the metadata for a given bundle from the HCA data store (DSS).
Expand All @@ -39,8 +54,8 @@ def download_bundle_metadata(client: DSSClient,
exclusive with the directurls parameter. Note this parameter, similar to the `directurls`,
is a temporary parameter, and it's not guaranteed to stay in this place in the future.
:param num_workers: The size of the thread pool to use for downloading metadata files in parallel. If absent, the
                    default pool size will be used, a small multiple of the number of cores on the system
executing this function. If 0, no thread pool will be used and all files will be downloaded
sequentially by the current thread.
Expand Down Expand Up @@ -110,14 +125,55 @@ def download_file(item):
return bundle['version'], manifest, dict(metadata_files)


def dss_client(deployment: str = 'prod',
               num_workers: Optional[int] = default_num_workers()) -> DSSClient:
    """
    Return a DSS client to DSS production or the specified DSS deployment.

    :param deployment: The name of a DSS deployment like `dev`, `integration`,
                       `staging` or `prod`.

    :param num_workers: The number of threads that will be using this client,
                        or None to use the default connection pool size. This
                        value is used to adequately size the HTTP connection
                        pool, which avoids discarding connections unnecessarily
                        as indicated by the accompanying `Connection pool is
                        full, discarding connection` warning.
    """
    # NOTE: the default for `num_workers` is evaluated once at import time,
    # which is fine because default_num_workers() is cached and deterministic.
    # `prod` is the only deployment whose hostname has no deployment prefix.
    deployment = "" if deployment == "prod" else deployment + "."
    swagger_url = f'https://dss.{deployment}data.humancellatlas.org/v1/swagger.json'
    client = _DSSClient(swagger_url=swagger_url,
                        adapter_args=None if num_workers is None else dict(pool_maxsize=num_workers))
    client.timeout_policy = Timeout(connect=10, read=40)
    return client


class _DSSClient(DSSClient):
    """
    A DSSClient with certain extensions and fixes.
    """

    def __init__(self, *args, adapter_args: Optional[Mapping[str, Any]] = None, **kwargs):
        """
        Pass `adapter_args=dict(pool_maxsize=num_threads)` in order to avoid the resource warnings.

        :param args: positional arguments to pass to DSSClient constructor

        :param adapter_args: optional keyword arguments to requests' HTTPAdapter class

        :param kwargs: keyword arguments to pass to DSSClient constructor
        """
        # Must be assigned before calling the superclass constructor,
        # presumably because DSSClient.__init__ invokes _set_retry_policy,
        # which reads this attribute -- TODO confirm against DSSClient.
        self._adapter_args = adapter_args  # yes, this must come first
        super().__init__(*args, **kwargs)

    def _set_retry_policy(self, session: Session):
        # Without adapter arguments, defer entirely to the stock behavior.
        if self._adapter_args is None:
            super()._set_retry_policy(session)
        else:
            from requests.sessions import HTTPAdapter

            # Subclass that injects our adapter arguments (e.g. pool_maxsize)
            # into every HTTPAdapter the superclass creates. The parameter is
            # deliberately named `self_` so that `self` inside the method
            # still refers to the enclosing _DSSClient instance (closure),
            # giving access to self._adapter_args.
            class MyHTTPAdapter(HTTPAdapter):

                # noinspection PyMethodParameters
                def __init__(self_, *args, **kwargs):
                    kwargs.update(self._adapter_args)
                    super().__init__(*args, **kwargs)

            # Temporarily swap the HTTPAdapter name that hca.util resolves at
            # call time, so the superclass unknowingly instantiates our
            # customized adapter while setting its retry policy.
            with patch('hca.util.HTTPAdapter', new=MyHTTPAdapter):
                super()._set_retry_policy(session)
Loading

0 comments on commit 5790e16

Please sign in to comment.