Make contents API scale (#3609)
* Make `contents` API scale

PBENCH-1321

The `/datasets/{id}/contents` API involves several unexpectedly expensive
steps:

1. Finding the tarball (by MD5 value) within the `ARCHIVE` tree using a `glob`
2. Fully discovering all tarballs within the controller directory
3. Unpacking the tarball into a cache directory using `tar`
4. Building a "map" of the contents of the unpacked tarball subtree

This PR includes mitigations for all but the `tar` unpack step:

1. Use the `server.tarball-path` metadata instead of searching the disk
2. Only discover the target tarball rather than the entire controller
3. Skip the "map" and evaluate the actual target path within the cache

Finding a tarball within our 30 TB `ARCHIVE` tree can take many minutes, while
identifying the controller directory from the tarball path takes a fraction of
a second.
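
As a rough sketch of mitigation 1, a single metadata read replaces the MD5 glob
over `ARCHIVE`. The `Metadata.getvalue()` accessor used here is an assumption
for illustration; only the `server.tarball-path` key is named by this commit:

```python
# Sketch only: look up the tarball location from dataset metadata rather than
# globbing the ARCHIVE tree by MD5. Metadata.getvalue() is assumed for this
# example; "server.tarball-path" is the metadata key named in the commit.
from pathlib import Path

from pbench.server.database.models.datasets import Dataset, Metadata


def tarball_path(dataset: Dataset) -> Path:
    # One metadata read; the controller directory is simply the tarball's parent.
    return Path(Metadata.getvalue(dataset=dataset, key="server.tarball-path"))
```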

Depending on the number of tarballs within a controller (some have many), full
controller discovery has been observed to take half a minute, while populating
only the target tarball takes a fraction of a second.

Building the map for a large tarball tree can take minutes, whereas discovery
of the actual relative file path within the cache runs at native (Python) file
system speeds.
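
Mitigation 3 boils down to resolving the one requested path against the
unpacked cache directory instead of pre-building a map of the whole tree. A
minimal, self-contained sketch of the idea (the unpack root and error handling
here are illustrative, not the actual `CacheManager` internals):

```python
from pathlib import Path


def resolve_cache_target(unpack_root: Path, relative: str) -> Path:
    """Resolve one requested path inside the unpacked tarball cache.

    Sketch only: a single resolve/stat at native filesystem speed, instead of
    walking the entire unpacked tree to build a contents map up front.
    """
    root = unpack_root.resolve()
    target = (root / relative).resolve()
    # Reject traversal outside the cache, and missing paths.
    if target != root and root not in target.parents:
        raise ValueError(f"{relative!r} escapes the cache")
    if not target.exists():
        raise FileNotFoundError(relative)
    return target
```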
dbutenhof authored Feb 21, 2024
1 parent d6b8f26 commit c0946eb
Showing 7 changed files with 558 additions and 340 deletions.
4 changes: 2 additions & 2 deletions lib/pbench/cli/server/report.py
@@ -486,12 +486,12 @@ def report(
 
     try:
         config = config_setup(context)
-        logger = get_pbench_logger("report-generator", config)
+        logger = get_pbench_logger("pbench-report-generator", config)
         if any((all, archive, backup, cache)):
             cache_m = CacheManager(config, logger)
             verifier.status("starting discovery")
             watcher.update("discovering cache")
-            cache_m.full_discovery()
+            cache_m.full_discovery(search=False)
             watcher.update("processing reports")
             verifier.status("finished discovery")
             if all or archive:
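
The only functional change here is the `search=False` argument (plus the logger
name). Its exact semantics aren't shown in this excerpt, but the commit message
suggests it lets discovery rely on recorded tarball paths rather than searching
the `ARCHIVE` tree. A hedged sketch of the call sequence, with `config` and
`logger` coming from the CLI setup shown above:

```python
# config and logger are produced by the CLI setup above (config_setup() and
# get_pbench_logger()); search=False is taken from this diff, and its meaning
# (skip the ARCHIVE-wide disk search) is inferred from the commit message.
cache_m = CacheManager(config, logger)
cache_m.full_discovery(search=False)
```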
2 changes: 1 addition & 1 deletion lib/pbench/cli/server/tree_manage.py
@@ -80,7 +80,7 @@ def tree_manage(
     logger = None
     try:
         config = config_setup(context)
-        logger = get_pbench_logger("cachemanager", config)
+        logger = get_pbench_logger("pbench-tree-manager", config)
         cache_m = CacheManager(config, logger)
         cache_m.full_discovery()
         if display:
99 changes: 9 additions & 90 deletions lib/pbench/server/api/resources/datasets_contents.py
@@ -1,5 +1,4 @@
 from http import HTTPStatus
-from pathlib import Path
 
 from flask import current_app, jsonify
 from flask.wrappers import Request, Response
@@ -22,8 +21,6 @@
     BadDirpath,
     CacheExtractBadPath,
     CacheManager,
-    CacheObject,
-    CacheType,
     TarballNotFound,
 )
 from pbench.server.database.models.datasets import Dataset
@@ -65,100 +62,22 @@ def _get(self, params: ApiParams, req: Request, context: ApiContext) -> Response
 
         dataset: Dataset = params.uri["dataset"]
         target = params.uri.get("target")
-        path = Path("." if target in ("/", None) else target)
+        path = "." if target in ("/", None) else target
 
+        prefix = current_app.server_config.rest_uri
+        origin = (
+            f"{self._get_uri_base(req).host}{prefix}/datasets/{dataset.resource_id}"
+        )
+
         cache_m = CacheManager(self.config, current_app.logger)
         try:
-            info = cache_m.find_entry(dataset.resource_id, path)
+            info = cache_m.get_contents(dataset.resource_id, path, origin)
         except (BadDirpath, CacheExtractBadPath, TarballNotFound) as e:
             raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
         except Exception as e:
             raise APIInternalError(f"Cache find error: {str(e)!r}")
 
-        prefix = current_app.server_config.rest_uri
-        origin = (
-            f"{self._get_uri_base(req).host}{prefix}/datasets/{dataset.resource_id}"
-        )
-
-        details: CacheObject = info["details"]
-        if details.type is CacheType.DIRECTORY:
-            children = info["children"] if "children" in info else {}
-            dir_list = []
-            file_list = []
-
-            for c, value in children.items():
-                d: CacheObject = value["details"]
-                if d.type is CacheType.DIRECTORY:
-                    dir_list.append(
-                        {
-                            "name": c,
-                            "type": d.type.name,
-                            "uri": f"{origin}/contents/{d.location}",
-                        }
-                    )
-                elif d.type is CacheType.SYMLINK:
-                    if d.resolve_type is CacheType.DIRECTORY:
-                        uri = f"{origin}/contents/{d.resolve_path}"
-                    elif d.resolve_type is CacheType.FILE:
-                        uri = f"{origin}/inventory/{d.resolve_path}"
-                    else:
-                        uri = f"{origin}/inventory/{d.location}"
-                    file_list.append(
-                        {
-                            "name": c,
-                            "type": d.type.name,
-                            "link": str(d.resolve_path),
-                            "link_type": d.resolve_type.name,
-                            "uri": uri,
-                        }
-                    )
-                else:
-                    r = {
-                        "name": c,
-                        "type": d.type.name,
-                        "uri": f"{origin}/inventory/{d.location}",
-                    }
-                    if d.type is CacheType.FILE:
-                        r["size"] = d.size
-                    file_list.append(r)
-
-            dir_list.sort(key=lambda d: d["name"])
-            file_list.sort(key=lambda d: d["name"])
-
-            # Normalize because we want the "root" directory to be reported as
-            # "" rather than as Path's favored "."
-            loc = str(details.location)
-            name = details.name
-            if loc == ".":
-                loc = ""
-                name = ""
-            val = {
-                "name": name,
-                "type": details.type.name,
-                "directories": dir_list,
-                "files": file_list,
-                "uri": f"{origin}/contents/{loc}",
-            }
-        else:
-            access = "inventory"
-            link = str(details.location)
-            if details.type is CacheType.SYMLINK:
-                if details.resolve_type is CacheType.DIRECTORY:
-                    access = "contents"
-                if details.resolve_type in (CacheType.FILE, CacheType.DIRECTORY):
-                    link = str(details.resolve_path)
-            val = {
-                "name": details.name,
-                "type": details.type.name,
-                "uri": f"{origin}/{access}/{link}",
-            }
-            if details.type is CacheType.SYMLINK:
-                val["link"] = link
-                val["link_type"] = details.resolve_type.name
-            elif details.type is CacheType.FILE:
-                val["size"] = details.size
-
         try:
-            return jsonify(val)
+            return jsonify(info)
         except Exception as e:
-            raise APIInternalError(f"JSONIFY {val}: {str(e)!r}")
+            raise APIInternalError(f"JSONIFY {info}: {str(e)!r}")