Merge pkgci_test_ jobs using matrices. (iree-org#17512)

Progress on iree-org#17136. Similar to iree-org#17509, this folds the five `regression_test_` jobs in pkgci.yml into two matrices, `test_onnx` and `test_models`, with most steps shared. The "model" tests are still verbose to configure here as we have multiple variants on some backends. Depending on how much we want to unify or diverge we could continue to refactor further.

ci-exactly: build_packages,regression_test
nod-ai · May 29, 2024 · b6b1df6 · b6b1df6
1 parent 2c59505
commit b6b1df6
Show file tree

Hide file tree

Showing 8 changed files with 128 additions and 557 deletions.
diff --git a/.github/workflows/pkgci.yml b/.github/workflows/pkgci.yml
@@ -37,35 +37,11 @@ jobs:
     with:
       package_version: 0.dev1
 
-  regression_test_cpu:
-    name: Regression Test CPU
+  regression_test:
+    name: Regression Test
     needs: [setup, build_packages]
-    if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_cpu')
-    uses: ./.github/workflows/pkgci_regression_test_cpu.yml
-
-  regression_test_amdgpu_vulkan:
-    name: Regression Test AMDGPU-Vulkan
-    needs: [setup, build_packages]
-    if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_amdgpu_vulkan')
-    uses: ./.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml
-
-  regression_test_amdgpu_rocm:
-    name: Regression Test AMDGPU-ROCm
-    needs: [setup, build_packages]
-    if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_amdgpu_rocm')
-    uses: ./.github/workflows/pkgci_regression_test_amdgpu_rocm.yml
-
-  regression_test_nvidiagpu_vulkan:
-    name: Regression Test NVIDIAGPU-Vulkan
-    needs: [setup, build_packages]
-    if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_nvidiagpu_vulkan')
-    uses: ./.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml
-
-  regression_test_nvidiagpu_cuda:
-    name: Regression Test NVIDIAGPU-CUDA
-    needs: [setup, build_packages]
-    if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_nvidiagpu_cuda')
-    uses: ./.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml
+    if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test')
+    uses: ./.github/workflows/pkgci_regression_test.yml
 
   test_tensorflow_cpu:
     name: Test TensorFlow CPU

diff --git a/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml b/.github/workflows/pkgci_regression_test.yml
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-name: PkgCI Regression Test (AMDGPU ROCm)
+name: PkgCI Regression Test
 on:
   workflow_call:
     inputs:
@@ -18,12 +18,54 @@ on:
         default: ""
 
 jobs:
-  linux_x86_64:
-    name: Linux (x86_64)
-    runs-on: nodai-amdgpu-w7900-x86-64
+  test_onnx:
+    name: "test_onnx :: ${{ matrix.name }}"
+    runs-on: ${{ matrix.runs-on }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # CPU
+          - name: cpu_llvm_sync
+            config-file: onnx_cpu_llvm_sync.json
+            numprocesses: auto
+            runs-on: ubuntu-20.04
+
+          # AMD GPU
+          - name: amdgpu_rocm_rdna3
+            numprocesses: 1
+            config-file: onnx_gpu_rocm_rdna3.json
+            runs-on: nodai-amdgpu-w7900-x86-64
+          - name: amdgpu_vulkan
+            numprocesses: 4
+            config-file: onnx_gpu_vulkan.json
+            runs-on: nodai-amdgpu-w7900-x86-64
+
+          # NVIDIA GPU
+          - name: nvidiagpu_cuda
+            config-file: onnx_gpu_cuda.json
+            numprocesses: 4
+            runs-on:
+              - self-hosted # must come first
+              - runner-group=${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
+              - environment=prod
+              - gpu  # TODO(scotttodd): qualify further with vendor/model
+              - os-family=Linux
+          - name: nvidiagpu_vulkan
+            config-file: onnx_gpu_vulkan.json
+            numprocesses: 4
+            runs-on:
+              - self-hosted # must come first
+              - runner-group=${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
+              - environment=prod
+              - gpu  # TODO(scotttodd): qualify further with vendor/model
+              - os-family=Linux
     env:
       PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
       IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
+      CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.config-file }}
+      NUMPROCESSES: ${{ matrix.numprocesses }}
+      LOG_FILE_PATH: /tmp/iree_tests_onnx_${{ matrix.name }}_logs.json
       VENV_DIR: ${{ github.workspace }}/venv
     steps:
       - name: Checking out IREE repository
@@ -44,63 +86,88 @@ jobs:
             --artifact-path=${PACKAGE_DOWNLOAD_DIR} \
             --fetch-gh-workflow=${{ inputs.artifact_run_id }}
 
-      # TODO(#17344): regenerate .mlirbc files
-      # # In-tree tests
-      # - name: Run experimental/regression_suite tests
-      #   run: |
-      #     source ${VENV_DIR}/bin/activate
-      #     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm/hip/lib
-      #     pytest \
-      #       -rA -s -m "plat_rdna3_rocm and presubmit" \
-      #       experimental/regression_suite
-
-      # Out of tree tests
-      # TODO(scotttodd): Increase parallelism when supported by the HIP HAL
-      #   driver and/or test runner machine.
-      - name: Checking out external TestSuite repository
+      - name: Check out external TestSuite repository
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
           ref: c9b3337e1f754c83d178568be1339aaef5f08045
           path: SHARK-TestSuite
           submodules: false
-      - name: Installing external TestSuite Python requirements
+          lfs: false
+      - name: Install external TestSuite Python requirements
         run: |
           source ${VENV_DIR}/bin/activate
           python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt
+
       - name: Run external tests - ONNX test suite
         run: |
           source ${VENV_DIR}/bin/activate
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm/hip/lib
           pytest SHARK-TestSuite/iree_tests/onnx/ \
-              -rpfE --timeout=30 --durations=20 \
-              --config-files=build_tools/pkgci/external_test_suite/onnx_gpu_rocm_rdna3.json \
+              -rpfE \
+              --numprocesses ${NUMPROCESSES} \
+              --timeout=30 \
+              --durations=20 \
               --no-skip-tests-missing-files \
-              --report-log=/tmp/iree_tests_onnx_gpu_rocm_rdna3_logs.json
+              --config-files=${CONFIG_FILE_PATH} \
+              --report-log=${LOG_FILE_PATH}
       - name: "Updating config file with latest XFAIL lists"
         if: failure()
         run: |
           source ${VENV_DIR}/bin/activate
           python SHARK-TestSuite/iree_tests/update_config_xfails.py \
-            --log-file=/tmp/iree_tests_onnx_gpu_rocm_rdna3_logs.json \
-            --config-file=build_tools/pkgci/external_test_suite/onnx_gpu_rocm_rdna3.json
-          cat build_tools/pkgci/external_test_suite/onnx_gpu_rocm_rdna3.json
+            --log-file=${LOG_FILE_PATH} \
+            --config-file=${CONFIG_FILE_PATH}
+          cat ${CONFIG_FILE_PATH}
       - name: "Uploading new config file"
         if: failure()
         uses: actions/upload-artifact@v4
         with:
-          name: "onnx_gpu_rocm_rdna3.json"
-          path: "build_tools/pkgci/external_test_suite/onnx_gpu_rocm_rdna3.json"
+          name: "${{ matrix.name }}_${{ matrix.config-file }}"  # unique per matrix entry; two entries share onnx_gpu_vulkan.json
+          path: "${{ env.CONFIG_FILE_PATH }}"  # `with:` inputs are not shell-expanded, so use the env context
 
-  linux_x86_64_rocm_models:
-    name: MI250 - Models
-    runs-on: nodai-amdgpu-mi250-x86-64
+  test_models:
+    name: "test_models :: ${{ matrix.name }}"
+    runs-on: ${{ matrix.runs-on }}
+    strategy:
+      fail-fast: false
+
+      # Note: these jobs should use persistent runners with local caches.
+      # Downloading test files (50GB+) without a cache can take 20+ minutes.
+      matrix:
+        include:
+          # CPU
+          - name: cpu_llvm_task
+            models-config-file: pytorch_models_cpu_llvm_task.json
+            sdxl-config-file: sdxl_scheduled_unet_cpu_llvm_task.json
+            runs-on: nodai-amdgpu-w7900-x86-64
+
+          # AMD GPU
+          - name: amdgpu_rocm_gfx90a
+            models-config-file: pytorch_models_gpu_rocm_gfx90a.json
+            models-extra-flags-config-file: pytorch_models_gpu_rocm_gfx90a_additional_flags.json
+            sdxl-config-file: sdxl_scheduled_unet_gpu_rocm_gfx90a.json
+            runs-on: nodai-amdgpu-mi250-x86-64
+          - name: amdgpu_vulkan
+            models-config-file: pytorch_models_gpu_vulkan.json
+            runs-on: nodai-amdgpu-w7900-x86-64
+
+          # NVIDIA GPU
+          # None at the moment. Could maybe use the persistent a100 runners:
+          #   - self-hosted # must come first
+          #   - runner-group=${{ needs.setup.outputs.runner-group }}
+          #   - environment=${{ needs.setup.outputs.runner-env }}
+          #   - a100
+          #   - os-family=Linux
+          # (note: would need to plumb the presubmit/postsubmit runner-group through to here too)
     env:
       PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
       IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
-      VENV_DIR: ${{ github.workspace }}/venv
       IREE_TEST_FILES: ~/iree_tests_cache
       IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
+      MODELS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-config-file }}
+      MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-extra-flags-config-file }}
+      SDXL_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-config-file }}
+      VENV_DIR: ${{ github.workspace }}/venv
     steps:
       - name: Checking out IREE repository
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
@@ -120,6 +187,15 @@ jobs:
             --artifact-path=${PACKAGE_DOWNLOAD_DIR} \
             --fetch-gh-workflow=${{ inputs.artifact_run_id }}
 
+      # TODO(#17344): regenerate .mlirbc files, test plat_rdna3_rocm on rocm
+      # # In-tree tests
+      # - name: Run experimental/regression_suite tests
+      #   run: |
+      #     source ${VENV_DIR}/bin/activate
+      #     pytest \
+      #       -rA -s -m "plat_host_cpu and presubmit" \
+      #       experimental/regression_suite
+
       # Out of tree tests
       - name: Check out external TestSuite repository
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
@@ -137,21 +213,23 @@ jobs:
         run: |
           source ${VENV_DIR}/bin/activate
           python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir pytorch/models
-      - name: Run external tests - Models with real weights
+
+      - name: Run external tests - models with real weights
+        if: "matrix.models-config-file != '' && !cancelled()"
         run: |
           source ${VENV_DIR}/bin/activate
           pytest SHARK-TestSuite/iree_tests/pytorch/models \
             -rpfE \
             -k real_weights \
             --no-skip-tests-missing-files \
-            --log-cli-level=info \
             --capture=no \
+            --log-cli-level=info \
             --timeout=1200 \
-            --retries 2 \
-            --retry-delay 5 \
             --durations=0 \
-            --config-files=build_tools/pkgci/external_test_suite/gpu_rocm_models_gfx90a.json
-      - name: Run external tests - Models with real weights and additional flags
+            --config-files=${MODELS_CONFIG_FILE_PATH}
+
+      - name: Run external tests - models with real weights and additional flags
+        if: "matrix.models-extra-flags-config-file != '' && !cancelled()"
         run: |
           source ${VENV_DIR}/bin/activate
           pytest SHARK-TestSuite/iree_tests/pytorch/models \
@@ -161,25 +239,25 @@ jobs:
             --capture=no \
             --log-cli-level=info \
             --timeout=1200 \
-            --retries 2 \
-            --retry-delay 5 \
             --durations=0 \
-            --config-files=build_tools/pkgci/external_test_suite/gpu_rocm_models_additional_flags_gfx90a.json
-      - name: "Running real weight model tests scheduled unet"
+            --config-files=${MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH}
+
+      - name: "Run external tests - SDXL scheduled unet"
+        if: "matrix.sdxl-config-file != '' && !cancelled()"
         run: |
           source ${VENV_DIR}/bin/activate
           pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-scheduled-unet-3-tank \
             -rpfE \
             -k real_weights \
             --no-skip-tests-missing-files \
-            --log-cli-level=info \
             --capture=no \
+            --log-cli-level=info \
             --timeout=1200 \
-            --retries 2 \
-            --retry-delay 5 \
             --durations=0 \
-            --config-files=build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json
+            --config-files=${SDXL_CONFIG_FILE_PATH}
+
       - name: "Running SDXL rocm pipeline benchmark"
+        if: contains(matrix.name, 'rocm')
         run: |
           source ${VENV_DIR}/bin/activate
           bash SHARK-TestSuite/iree_tests/benchmarks/benchmark_sdxl_rocm.sh