Make contents API scale (#3609)
* Make `contents` API scale

PBENCH-1321

The `/datasets/{id}/contents` API involves several unexpectedly expensive
steps:

1. Finding the tarball (by MD5 value) within the `ARCHIVE` tree using a `glob`
2. Fully discovering all tarballs within the controller directory
3. Unpacking the tarball into a cache directory using `tar`
4. Building a "map" of the contents of the unpacked tarball subtree

This PR includes mitigations for all but the `tar` unpack step:

1. Use the `server.tarball-path` metadata instead of searching the disk
2. Only discover the target tarball rather than the entire controller
3. Skip the "map" and evaluate the actual target path within the cache

Finding a tarball within our 30 TB `ARCHIVE` tree can take many minutes, while
identifying the controller directory from the tarball path takes a fraction of
a second.
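
As a rough sketch of mitigation 1, a single metadata read replaces the MD5 glob
over `ARCHIVE`. The `Metadata.getvalue()` accessor used here is an assumption
for illustration; only the `server.tarball-path` key is named by this commit:

```python
# Sketch only: look up the tarball location from dataset metadata rather than
# globbing the ARCHIVE tree by MD5. Metadata.getvalue() is assumed for this
# example; "server.tarball-path" is the metadata key named in the commit.
from pathlib import Path

from pbench.server.database.models.datasets import Dataset, Metadata


def tarball_path(dataset: Dataset) -> Path:
    # One metadata read; the controller directory is simply the tarball's parent.
    return Path(Metadata.getvalue(dataset=dataset, key="server.tarball-path"))
```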

Depending on the number of tarballs within a controller (some have many), full
controller discovery has been observed to take half a minute, while populating
only the target tarball takes a fraction of a second.

Building the map for a large tarball tree can take minutes, whereas discovery
of the actual relative file path within the cache runs at native (Python) file
system speeds.
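
Mitigation 3 boils down to resolving the one requested path against the
unpacked cache directory instead of pre-building a map of the whole tree. A
minimal, self-contained sketch of the idea (the unpack root and error handling
here are illustrative, not the actual `CacheManager` internals):

```python
from pathlib import Path


def resolve_cache_target(unpack_root: Path, relative: str) -> Path:
    """Resolve one requested path inside the unpacked tarball cache.

    Sketch only: a single resolve/stat at native filesystem speed, instead of
    walking the entire unpacked tree to build a contents map up front.
    """
    root = unpack_root.resolve()
    target = (root / relative).resolve()
    # Reject traversal outside the cache, and missing paths.
    if target != root and root not in target.parents:
        raise ValueError(f"{relative!r} escapes the cache")
    if not target.exists():
        raise FileNotFoundError(relative)
    return target
```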
dbutenhof authored Feb 21, 2024
1 parent d6b8f26 commit c0946eb
Showing 7 changed files with 558 additions and 340 deletions.
4 changes: 2 additions & 2 deletions lib/pbench/cli/server/report.py
@@ -486,12 +486,12 @@ def report(
 
     try:
         config = config_setup(context)
-        logger = get_pbench_logger("report-generator", config)
+        logger = get_pbench_logger("pbench-report-generator", config)
         if any((all, archive, backup, cache)):
             cache_m = CacheManager(config, logger)
             verifier.status("starting discovery")
             watcher.update("discovering cache")
-            cache_m.full_discovery()
+            cache_m.full_discovery(search=False)
             watcher.update("processing reports")
             verifier.status("finished discovery")
             if all or archive:
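
The only functional change here is the `search=False` argument (plus the logger
name). Its exact semantics aren't shown in this excerpt, but the commit message
suggests it lets discovery rely on recorded tarball paths rather than searching
the `ARCHIVE` tree. A hedged sketch of the call sequence, with `config` and
`logger` coming from the CLI setup shown above:

```python
# config and logger are produced by the CLI setup above (config_setup() and
# get_pbench_logger()); search=False is taken from this diff, and its meaning
# (skip the ARCHIVE-wide disk search) is inferred from the commit message.
cache_m = CacheManager(config, logger)
cache_m.full_discovery(search=False)
```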
2 changes: 1 addition & 1 deletion lib/pbench/cli/server/tree_manage.py
@@ -80,7 +80,7 @@ def tree_manage(
     logger = None
     try:
         config = config_setup(context)
-        logger = get_pbench_logger("cachemanager", config)
+        logger = get_pbench_logger("pbench-tree-manager", config)
         cache_m = CacheManager(config, logger)
         cache_m.full_discovery()
         if display:
99 changes: 9 additions & 90 deletions lib/pbench/server/api/resources/datasets_contents.py
@@ -1,5 +1,4 @@
 from http import HTTPStatus
-from pathlib import Path
 
 from flask import current_app, jsonify
 from flask.wrappers import Request, Response
@@ -22,8 +21,6 @@
     BadDirpath,
     CacheExtractBadPath,
     CacheManager,
-    CacheObject,
-    CacheType,
     TarballNotFound,
 )
 from pbench.server.database.models.datasets import Dataset
@@ -65,100 +62,22 @@ def _get(self, params: ApiParams, req: Request, context: ApiContext) -> Response
 
         dataset: Dataset = params.uri["dataset"]
         target = params.uri.get("target")
-        path = Path("." if target in ("/", None) else target)
+        path = "." if target in ("/", None) else target
 
+        prefix = current_app.server_config.rest_uri
+        origin = (
+            f"{self._get_uri_base(req).host}{prefix}/datasets/{dataset.resource_id}"
+        )
+
         cache_m = CacheManager(self.config, current_app.logger)
         try:
-            info = cache_m.find_entry(dataset.resource_id, path)
+            info = cache_m.get_contents(dataset.resource_id, path, origin)
         except (BadDirpath, CacheExtractBadPath, TarballNotFound) as e:
             raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
         except Exception as e:
             raise APIInternalError(f"Cache find error: {str(e)!r}")
 
-        prefix = current_app.server_config.rest_uri
-        origin = (
-            f"{self._get_uri_base(req).host}{prefix}/datasets/{dataset.resource_id}"
-        )
-
-        details: CacheObject = info["details"]
-        if details.type is CacheType.DIRECTORY:
-            children = info["children"] if "children" in info else {}
-            dir_list = []
-            file_list = []
-
-            for c, value in children.items():
-                d: CacheObject = value["details"]
-                if d.type is CacheType.DIRECTORY:
-                    dir_list.append(
-                        {
-                            "name": c,
-                            "type": d.type.name,
-                            "uri": f"{origin}/contents/{d.location}",
-                        }
-                    )
-                elif d.type is CacheType.SYMLINK:
-                    if d.resolve_type is CacheType.DIRECTORY:
-                        uri = f"{origin}/contents/{d.resolve_path}"
-                    elif d.resolve_type is CacheType.FILE:
-                        uri = f"{origin}/inventory/{d.resolve_path}"
-                    else:
-                        uri = f"{origin}/inventory/{d.location}"
-                    file_list.append(
-                        {
-                            "name": c,
-                            "type": d.type.name,
-                            "link": str(d.resolve_path),
-                            "link_type": d.resolve_type.name,
-                            "uri": uri,
-                        }
-                    )
-                else:
-                    r = {
-                        "name": c,
-                        "type": d.type.name,
-                        "uri": f"{origin}/inventory/{d.location}",
-                    }
-                    if d.type is CacheType.FILE:
-                        r["size"] = d.size
-                    file_list.append(r)
-
-            dir_list.sort(key=lambda d: d["name"])
-            file_list.sort(key=lambda d: d["name"])
-
-            # Normalize because we want the "root" directory to be reported as
-            # "" rather than as Path's favored "."
-            loc = str(details.location)
-            name = details.name
-            if loc == ".":
-                loc = ""
-                name = ""
-            val = {
-                "name": name,
-                "type": details.type.name,
-                "directories": dir_list,
-                "files": file_list,
-                "uri": f"{origin}/contents/{loc}",
-            }
-        else:
-            access = "inventory"
-            link = str(details.location)
-            if details.type is CacheType.SYMLINK:
-                if details.resolve_type is CacheType.DIRECTORY:
-                    access = "contents"
-                if details.resolve_type in (CacheType.FILE, CacheType.DIRECTORY):
-                    link = str(details.resolve_path)
-            val = {
-                "name": details.name,
-                "type": details.type.name,
-                "uri": f"{origin}/{access}/{link}",
-            }
-            if details.type is CacheType.SYMLINK:
-                val["link"] = link
-                val["link_type"] = details.resolve_type.name
-            elif details.type is CacheType.FILE:
-                val["size"] = details.size
-
         try:
-            return jsonify(val)
+            return jsonify(info)
         except Exception as e:
-            raise APIInternalError(f"JSONIFY {val}: {str(e)!r}")
+            raise APIInternalError(f"JSONIFY {info}: {str(e)!r}")