[libshortfin] Add simple invocation test. (#170)

This test is not particularly inspired (and the API needs to be simplified) but it represents the first full system test in the repo. In order to run the test, it is downloading a mobilenet onnx file from the zoo, upgrading it, and compiling. In the future, I'd like to switch this to a simpler model like MNIST for basic functionality, but I had some issues getting that to work via ONNX import and punted. While a bit inefficient (it will fetch on each pytest run), this will keep things held together until we can do something more comprehensive. Note that my experience here prompted me to file iree-org/iree#18289, as this is way too much code and sharp edges to compile from ONNX (but it does work). Verifies numerics against a silly test image. Includes some fixes: * Reworked the system detect marker so that we only run system specific tests (like amdgpu) on opt-in via a `--system amdgpu` pytest arg. This refinement was prompted by an ASAN violation in the HIP runtime code which was tripping me up when enabled by default. Filed here: iree-org/iree#18449 * Fixed a bug revealed when writing the test where an exception thrown from main could trigger a use-after-free because we were clearing workers when shutting down (vs at destruction) when all objects owned at the system level need to have a lifetime no less than the system.
nod-ai · Sep 6, 2024 · 5a198e9 · 5a198e9
1 parent a038133
commit 5a198e9
Show file tree

Hide file tree

Showing 13 changed files with 193 additions and 14 deletions.
diff --git a/.github/workflows/ci_linux_x64-libshortfin.yml b/.github/workflows/ci_linux_x64-libshortfin.yml
@@ -85,6 +85,7 @@ jobs:
       # TODO: Switch to `pip install -r requirements.txt -e libshortfin/`.
       run: |
         pip install -r ${{ env.LIBSHORTFIN_DIR }}/requirements-tests.txt
+        pip install -r ${{ env.LIBSHORTFIN_DIR }}/requirements-iree-compiler.txt
         pip freeze
 
     - name: Build libshortfin (full)
@@ -107,7 +108,7 @@ jobs:
         cd ${{ env.LIBSHORTFIN_DIR }}/build
         ctest --timeout 30 --output-on-failure
         cd ${{ env.LIBSHORTFIN_DIR }}
-        pytest -s -v -m "not requires_amd_gpu"
+        pytest -s
 
     - name: Build libshortfin (host-only)
       run: |

diff --git a/.github/workflows/ci_linux_x64_asan-libshortfin.yml b/.github/workflows/ci_linux_x64_asan-libshortfin.yml
@@ -76,8 +76,10 @@ jobs:
     needs: [setup-python-asan]
     runs-on: ubuntu-24.04
     env:
-      # TODO(#151): Don't ignore ODR violations
-      ASAN_OPTIONS: detect_odr_violation=0
+      # We can't count on being leak free in general (i.e. pip, etc) so disable
+      # leak checker by default. Here we suppress any ASAN features needed to
+      # pass the build. Test configuration is done specially just for that step.
+      ASAN_OPTIONS: detect_leaks=0,detect_odr_violation=0
       LSAN_OPTIONS: suppressions=${{ github.workspace }}/libshortfin/build_tools/python_lsan_suppressions.txt
     steps:
     - name: Install dependencies
@@ -170,7 +172,10 @@ jobs:
 
     - name: Run pytest
       if: ${{ !cancelled() }}
+      env:
+        # TODO(#151): Don't ignore ODR violations
+        ASAN_OPTIONS: detect_odr_violation=0
       run: |
         eval "$(pyenv init -)"
         cd ${{ env.LIBSHORTFIN_DIR }}
-        pytest -m "not requires_amd_gpu" -s
+        pytest -s
diff --git a/libshortfin/README.md b/libshortfin/README.md
@@ -61,6 +61,20 @@ does mean that the C++ core of the library must always be built with the
 Python bindings to test the most behavior. Given the target of the project,
 this is not considered to be a significant issue.
 
+### Python tests
+
+Run platform independent tests only:
+
+```
+pytest tests/
+```
+
+Run tests including for a specific platform:
+
+```
+pytest tests/ --system amdgpu
+```
+
 # Production Library Building
 
 In order to build a production library, additional build steps are typically

diff --git a/libshortfin/build_tools/python_lsan_suppressions.txt b/libshortfin/build_tools/python_lsan_suppressions.txt
@@ -1,2 +1,8 @@
 leak:PyUnicode_New
+leak:_PyUnicodeWriter_PrepareInternal
 leak:_PyUnicodeWriter_Finish
+leak:numpy
+leak:_mlir_libs
+leak:google/_upb
+leak:import_find_and_load
+leak:ufunc
diff --git a/libshortfin/pyproject.toml b/libshortfin/pyproject.toml
@@ -12,9 +12,6 @@ addopts = [
     "-ra",
     "--import-mode=importlib",
 ]
-markers = [
-    "requires_amd_gpu: tests that require and AMD GPU (deselect with '-m \"not requires_amd_gpu\"')",
-]
 testpaths = [
     "tests",
 ]
diff --git a/libshortfin/requirements-tests.txt b/libshortfin/requirements-tests.txt
@@ -1,4 +1,5 @@
 pytest
 requests
 fastapi
+onnx
 uvicorn
diff --git a/libshortfin/src/shortfin/local/system.cc b/libshortfin/src/shortfin/local/system.cc
@@ -61,31 +61,28 @@ System::~System() {
 
 void System::Shutdown() {
   // Stop workers.
-  std::vector<std::unique_ptr<Worker>> local_workers;
   {
     iree::slim_mutex_lock_guard guard(lock_);
     if (!initialized_ || shutdown_) return;
     shutdown_ = true;
-    workers_by_name_.clear();
-    local_workers.swap(workers_);
   }
 
   // Worker drain and shutdown.
-  for (auto &worker : local_workers) {
+  for (auto &worker : workers_) {
     worker->Kill();
   }
-  for (auto &worker : local_workers) {
+  for (auto &worker : workers_) {
     if (worker->options().owned_thread) {
       worker->WaitForShutdown();
     }
   }
   blocking_executor_.Kill();
-  local_workers.clear();
 }
 
 std::shared_ptr<Scope> System::CreateScope(Worker &worker,
                                            std::span<Device *const> devices) {
   iree::slim_mutex_lock_guard guard(lock_);
+  AssertRunning();
   return std::make_shared<Scope>(shared_ptr(), worker, devices);
 }
 
@@ -102,6 +99,7 @@ void System::InitializeNodes(int node_count) {
 
 Queue &System::CreateQueue(Queue::Options options) {
   iree::slim_mutex_lock_guard guard(lock_);
+  AssertRunning();
   if (queues_by_name_.count(options.name) != 0) {
     throw std::invalid_argument(fmt::format(
         "Cannot create queue with duplicate name '{}'", options.name));
@@ -140,6 +138,7 @@ Worker &System::CreateWorker(Worker::Options options) {
   Worker *unowned_worker;
   {
     iree::slim_mutex_lock_guard guard(lock_);
+    AssertRunning();
     if (options.name == std::string_view("__init__")) {
       throw std::invalid_argument(
           "Cannot create worker '__init__' (reserved name)");
@@ -161,6 +160,7 @@ Worker &System::CreateWorker(Worker::Options options) {
 
 Worker &System::init_worker() {
   iree::slim_mutex_lock_guard guard(lock_);
+  AssertRunning();
   auto found_it = workers_by_name_.find("__init__");
   if (found_it != workers_by_name_.end()) {
     return *found_it->second;
@@ -207,6 +207,7 @@ void System::FinishInitialization() {
 
 int64_t System::AllocateProcess(detail::BaseProcess *p) {
   iree::slim_mutex_lock_guard guard(lock_);
+  AssertRunning();
   int pid = next_pid_++;
   processes_by_pid_[pid] = p;
   return pid;

diff --git a/libshortfin/src/shortfin/local/system.h b/libshortfin/src/shortfin/local/system.h
@@ -148,6 +148,13 @@ class SHORTFIN_API System : public std::enable_shared_from_this<System> {
           "initialization");
     }
   }
+  void AssertRunning() {
+    if (!initialized_ || shutdown_) {
+      throw std::logic_error(
+          "System manipulation methods can only be called when initialized and "
+          "not shutdown");
+    }
+  }
 
   // Allocates a process in the process table and returns its new pid.
   // This is done on process construction. Note that it acquires the

diff --git a/libshortfin/tests/__init__.py b/libshortfin/tests/__init__.py
diff --git a/libshortfin/tests/amdgpu_system_test.py b/libshortfin/tests/amdgpu_system_test.py
@@ -7,7 +7,7 @@
 import pytest
 
 
-@pytest.mark.requires_amd_gpu
+@pytest.mark.system("amdgpu")
 def test_create_amd_gpu_system():
     from _shortfin import lib as sfl
 

diff --git a/libshortfin/tests/conftest.py b/libshortfin/tests/conftest.py
@@ -0,0 +1,34 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--system",
+        action="store",
+        metavar="NAME",
+        nargs="*",
+        help="Enable tests for system name ('amdgpu', ...)",
+    )
+
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers", "system(name): mark test to run only on a named system"
+    )
+
+
+def pytest_runtest_setup(item):
+    required_system_names = [mark.args[0] for mark in item.iter_markers("system")]
+    if required_system_names:
+        available_system_names = item.config.getoption("--system") or []
+        if not all(name in available_system_names for name in required_system_names):
+            pytest.skip(
+                f"test requires system in {required_system_names!r} but has "
+                f"{available_system_names!r} (set with --system arg)"
+            )
diff --git a/libshortfin/tests/invocation/conftest.py b/libshortfin/tests/invocation/conftest.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import pytest
+import urllib.request
+
+
+def upgrade_onnx(original_path, converted_path):
+    import onnx
+
+    original_model = onnx.load_model(original_path)
+    converted_model = onnx.version_converter.convert_version(original_model, 17)
+    onnx.save(converted_model, converted_path)
+
+
+@pytest.fixture(scope="session")
+def mobilenet_onnx_path(tmp_path_factory):
+    try:
+        import onnx
+    except ModuleNotFoundError:
+        raise pytest.skip("onnx python package not available")
+    print("Downloading mobilenet.onnx")
+    parent_dir = tmp_path_factory.mktemp("mobilenet_onnx")
+    orig_onnx_path = parent_dir / "mobilenet_orig.onnx"
+    urllib.request.urlretrieve(
+        "https://github.com/onnx/models/raw/main/validated/vision/classification/mobilenet/model/mobilenetv2-12.onnx",
+        orig_onnx_path,
+    )
+    upgraded_onnx_path = parent_dir / "mobilenet.onnx"
+    upgrade_onnx(orig_onnx_path, upgraded_onnx_path)
+    return upgraded_onnx_path
+
+
+@pytest.fixture(scope="session")
+def mobilenet_compiled_cpu_path(mobilenet_onnx_path):
+    try:
+        import iree.compiler.tools as tools
+        import iree.compiler.tools.import_onnx.__main__ as import_onnx
+    except ModuleNotFoundError:
+        raise pytest.skip("iree.compiler packages not available")
+    print("Compiling mobilenet")
+    mlir_path = mobilenet_onnx_path.parent / "mobilenet.mlir"
+    vmfb_path = mobilenet_onnx_path.parent / "mobilenet_cpu.vmfb"
+    args = import_onnx.parse_arguments(["-o", str(mlir_path), str(mobilenet_onnx_path)])
+    import_onnx.main(args)
+    tools.compile_file(
+        str(mlir_path),
+        output_file=str(vmfb_path),
+        target_backends=["llvm-cpu"],
+        input_type="onnx",
+    )
+    return vmfb_path
diff --git a/libshortfin/tests/invocation/mobilenet_program_test.py b/libshortfin/tests/invocation/mobilenet_program_test.py
@@ -0,0 +1,58 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import array
+import functools
+import pytest
+
+import shortfin as sf
+import shortfin.array as sfnp
+
+
+@pytest.fixture
+def lsys():
+    sc = sf.host.CPUSystemBuilder()
+    lsys = sc.create_system()
+    yield lsys
+    lsys.shutdown()
+
+
+@pytest.fixture
+def scope(lsys):
+    return lsys.create_scope()
+
+
+@pytest.fixture
+def device(scope):
+    return scope.device(0)
+
+
+def test_invoke_mobilenet(lsys, scope, mobilenet_compiled_cpu_path):
+    device = scope.device(0)
+    dummy_data = array.array(
+        "f", ([0.2] * (224 * 224)) + ([0.4] * (224 * 224)) + ([-0.2] * (224 * 224))
+    )
+    program_module = lsys.load_module(mobilenet_compiled_cpu_path)
+    program = sf.Program([program_module], scope=scope)
+    main_function = program["module.torch-jit-export"]
+
+    async def main():
+        device_input = sfnp.device_array(device, [1, 3, 224, 224], sfnp.float32)
+        staging_input = device_input.for_transfer()
+        staging_input.storage.data = dummy_data
+        device_input.copy_from(staging_input)
+        (device_output,) = await main_function(device_input)
+        host_output = device_output.for_transfer()
+        host_output.copy_from(device_output)
+        await device
+        flat_output = array.array("f")
+        flat_output.frombytes(host_output.storage.data)
+        absmean = functools.reduce(
+            lambda x, y: x + abs(y) / len(flat_output), flat_output, 0.0
+        )
+        assert absmean == pytest.approx(5.01964943873882)
+
+    lsys.run(main())