Skip to content

Commit

Permalink
Merge pkgci_test_ jobs using matrices. (iree-org#17512)
Browse files Browse the repository at this point in the history
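Progress on iree-org#17136. Similar to
iree-org#17509, this folds the five
`regression_test_*` jobs in pkgci.yml (and the workflow files they called)
into a single `regression_test` job whose workflow defines two matrix jobs,
`test_onnx` and `test_models`, with most steps shared across matrix entries.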

The "model" tests are still verbose to configure here, as we have
multiple variants on some backends. Depending on how much we want to
unify or diverge, we could continue to refactor further.

ci-exactly: build_packages,regression_test
  • Loading branch information
ScottTodd authored May 29, 2024
1 parent 2c59505 commit b6b1df6
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 557 deletions.
32 changes: 4 additions & 28 deletions .github/workflows/pkgci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,35 +37,11 @@ jobs:
with:
package_version: 0.dev1

regression_test_cpu:
name: Regression Test CPU
regression_test:
name: Regression Test
needs: [setup, build_packages]
if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_cpu')
uses: ./.github/workflows/pkgci_regression_test_cpu.yml

regression_test_amdgpu_vulkan:
name: Regression Test AMDGPU-Vulkan
needs: [setup, build_packages]
if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_amdgpu_vulkan')
uses: ./.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml

regression_test_amdgpu_rocm:
name: Regression Test AMDGPU-ROCm
needs: [setup, build_packages]
if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_amdgpu_rocm')
uses: ./.github/workflows/pkgci_regression_test_amdgpu_rocm.yml

regression_test_nvidiagpu_vulkan:
name: Regression Test NVIDIAGPU-Vulkan
needs: [setup, build_packages]
if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_nvidiagpu_vulkan')
uses: ./.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml

regression_test_nvidiagpu_cuda:
name: Regression Test NVIDIAGPU-CUDA
needs: [setup, build_packages]
if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test_nvidiagpu_cuda')
uses: ./.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml
if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'regression_test')
uses: ./.github/workflows/pkgci_regression_test.yml

test_tensorflow_cpu:
name: Test TensorFlow CPU
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

name: PkgCI Regression Test (AMDGPU ROCm)
name: PkgCI Regression Test
on:
workflow_call:
inputs:
Expand All @@ -18,12 +18,54 @@ on:
default: ""

jobs:
linux_x86_64:
name: Linux (x86_64)
runs-on: nodai-amdgpu-w7900-x86-64
test_onnx:
name: "test_onnx :: ${{ matrix.name }}"
runs-on: ${{ matrix.runs-on }}
strategy:
fail-fast: false
matrix:
include:
# CPU
- name: cpu_llvm_sync
config-file: onnx_cpu_llvm_sync.json
numprocesses: auto
runs-on: ubuntu-20.04

# AMD GPU
- name: amdgpu_rocm_rdna3
numprocesses: 1
config-file: onnx_gpu_rocm_rdna3.json
runs-on: nodai-amdgpu-w7900-x86-64
- name: amdgpu_vulkan
numprocesses: 4
config-file: onnx_gpu_vulkan.json
runs-on: nodai-amdgpu-w7900-x86-64

# NVIDIA GPU
- name: nvidiagpu_cuda
config-file: onnx_gpu_cuda.json
numprocesses: 4
runs-on:
- self-hosted # must come first
- runner-group=${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
- environment=prod
- gpu # TODO(scotttodd): qualify further with vendor/model
- os-family=Linux
- name: nvidiagpu_vulkan
config-file: onnx_gpu_vulkan.json
numprocesses: 4
runs-on:
- self-hosted # must come first
- runner-group=${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
- environment=prod
- gpu # TODO(scotttodd): qualify further with vendor/model
- os-family=Linux
env:
PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.config-file }}
NUMPROCESSES: ${{ matrix.numprocesses }}
LOG_FILE_PATH: /tmp/iree_tests_onnx_${{ matrix.name }}_logs.json
VENV_DIR: ${{ github.workspace }}/venv
steps:
- name: Checking out IREE repository
Expand All @@ -44,63 +86,88 @@ jobs:
--artifact-path=${PACKAGE_DOWNLOAD_DIR} \
--fetch-gh-workflow=${{ inputs.artifact_run_id }}
# TODO(#17344): regenerate .mlirbc files
# # In-tree tests
# - name: Run experimental/regression_suite tests
# run: |
# source ${VENV_DIR}/bin/activate
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm/hip/lib
# pytest \
# -rA -s -m "plat_rdna3_rocm and presubmit" \
# experimental/regression_suite

# Out of tree tests
# TODO(scotttodd): Increase parallelism when supported by the HIP HAL
# driver and/or test runner machine.
- name: Checking out external TestSuite repository
- name: Check out external TestSuite repository
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
ref: c9b3337e1f754c83d178568be1339aaef5f08045
path: SHARK-TestSuite
submodules: false
- name: Installing external TestSuite Python requirements
lfs: false
- name: Install external TestSuite Python requirements
run: |
source ${VENV_DIR}/bin/activate
python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt
- name: Run external tests - ONNX test suite
run: |
source ${VENV_DIR}/bin/activate
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm/hip/lib
pytest SHARK-TestSuite/iree_tests/onnx/ \
-rpfE --timeout=30 --durations=20 \
--config-files=build_tools/pkgci/external_test_suite/onnx_gpu_rocm_rdna3.json \
-rpfE \
--numprocesses ${NUMPROCESSES} \
--timeout=30 \
--durations=20 \
--no-skip-tests-missing-files \
--report-log=/tmp/iree_tests_onnx_gpu_rocm_rdna3_logs.json
--config-files=${CONFIG_FILE_PATH} \
--report-log=${LOG_FILE_PATH}
- name: "Updating config file with latest XFAIL lists"
if: failure()
run: |
source ${VENV_DIR}/bin/activate
python SHARK-TestSuite/iree_tests/update_config_xfails.py \
--log-file=/tmp/iree_tests_onnx_gpu_rocm_rdna3_logs.json \
--config-file=build_tools/pkgci/external_test_suite/onnx_gpu_rocm_rdna3.json
cat build_tools/pkgci/external_test_suite/onnx_gpu_rocm_rdna3.json
--log-file=${LOG_FILE_PATH} \
--config-file=${CONFIG_FILE_PATH}
cat ${CONFIG_FILE_PATH}
- name: "Uploading new config file"
if: failure()
uses: actions/upload-artifact@v4
with:
name: "onnx_gpu_rocm_rdna3.json"
path: "build_tools/pkgci/external_test_suite/onnx_gpu_rocm_rdna3.json"
name: "${{ matrix.config-file }}"
path: "${CONFIG_FILE_PATH}"

linux_x86_64_rocm_models:
name: MI250 - Models
runs-on: nodai-amdgpu-mi250-x86-64
test_models:
name: "test_models :: ${{ matrix.name }}"
runs-on: ${{ matrix.runs-on }}
strategy:
fail-fast: false

# Note: these jobs should use persistent runners with local caches.
# Downloading test files (50GB+) without a cache can take 20+ minutes.
matrix:
include:
# CPU
- name: cpu_llvm_task
models-config-file: pytorch_models_cpu_llvm_task.json
sdxl-config-file: sdxl_scheduled_unet_cpu_llvm_task.json
runs-on: nodai-amdgpu-w7900-x86-64

# AMD GPU
- name: amdgpu_rocm_gfx90a
models-config-file: pytorch_models_gpu_rocm_gfx90a.json
models-extra-flags-config-file: pytorch_models_gpu_rocm_gfx90a_additional_flags.json
sdxl-config-file: sdxl_scheduled_unet_gpu_rocm_gfx90a.json
runs-on: nodai-amdgpu-mi250-x86-64
- name: amdgpu_vulkan
models-config-file: pytorch_models_gpu_vulkan.json
runs-on: nodai-amdgpu-w7900-x86-64

# NVIDIA GPU
# None at the moment. Could maybe use the persistent a100 runners:
# - self-hosted # must come first
# - runner-group=${{ needs.setup.outputs.runner-group }}
# - environment=${{ needs.setup.outputs.runner-env }}
# - a100
# - os-family=Linux
# (note: would need to plumb the presubmit/postsubmit runner-group through to here too)
env:
PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
VENV_DIR: ${{ github.workspace }}/venv
IREE_TEST_FILES: ~/iree_tests_cache
IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
MODELS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-config-file }}
MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-extra-flags-config-file }}
SDXL_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-config-file }}
VENV_DIR: ${{ github.workspace }}/venv
steps:
- name: Checking out IREE repository
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
Expand All @@ -120,6 +187,15 @@ jobs:
--artifact-path=${PACKAGE_DOWNLOAD_DIR} \
--fetch-gh-workflow=${{ inputs.artifact_run_id }}
# TODO(#17344): regenerate .mlirbc files, test plat_rdna3_rocm on rocm
# # In-tree tests
# - name: Run experimental/regression_suite tests
# run: |
# source ${VENV_DIR}/bin/activate
# pytest \
# -rA -s -m "plat_host_cpu and presubmit" \
# experimental/regression_suite

# Out of tree tests
- name: Check out external TestSuite repository
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
Expand All @@ -137,21 +213,23 @@ jobs:
run: |
source ${VENV_DIR}/bin/activate
python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir pytorch/models
- name: Run external tests - Models with real weights
- name: Run external tests - models with real weights
if: "matrix.models-config-file != '' && !cancelled()"
run: |
source ${VENV_DIR}/bin/activate
pytest SHARK-TestSuite/iree_tests/pytorch/models \
-rpfE \
-k real_weights \
--no-skip-tests-missing-files \
--log-cli-level=info \
--capture=no \
--log-cli-level=info \
--timeout=1200 \
--retries 2 \
--retry-delay 5 \
--durations=0 \
--config-files=build_tools/pkgci/external_test_suite/gpu_rocm_models_gfx90a.json
- name: Run external tests - Models with real weights and additional flags
--config-files=${MODELS_CONFIG_FILE_PATH}
- name: Run external tests - models with real weights and additional flags
if: "matrix.models-extra-flags-config-file != '' && !cancelled()"
run: |
source ${VENV_DIR}/bin/activate
pytest SHARK-TestSuite/iree_tests/pytorch/models \
Expand All @@ -161,25 +239,25 @@ jobs:
--capture=no \
--log-cli-level=info \
--timeout=1200 \
--retries 2 \
--retry-delay 5 \
--durations=0 \
--config-files=build_tools/pkgci/external_test_suite/gpu_rocm_models_additional_flags_gfx90a.json
- name: "Running real weight model tests scheduled unet"
--config-files=${MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH}
- name: "Run external tests - SDXL scheduled unet"
if: "matrix.sdxl-config-file != '' && !cancelled()"
run: |
source ${VENV_DIR}/bin/activate
pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-scheduled-unet-3-tank \
-rpfE \
-k real_weights \
--no-skip-tests-missing-files \
--log-cli-level=info \
--capture=no \
--log-cli-level=info \
--timeout=1200 \
--retries 2 \
--retry-delay 5 \
--durations=0 \
--config-files=build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json
--config-files=${SDXL_CONFIG_FILE_PATH}
- name: "Running SDXL rocm pipeline benchmark"
if: contains(matrix.name, 'rocm')
run: |
source ${VENV_DIR}/bin/activate
bash SHARK-TestSuite/iree_tests/benchmarks/benchmark_sdxl_rocm.sh
Loading

0 comments on commit b6b1df6

Please sign in to comment.