Skip to content

Commit

Permalink
CI: run NCCL tests on AWS with NGC release candidate images (#1234)
Browse files Browse the repository at this point in the history
#1233 pushes the relevant
subset to `main`.
  • Loading branch information
olupton authored Jan 13, 2025
1 parent a13a946 commit 2cab2b9
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 119 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/_build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Login to NVIDIA Container Registry
uses: docker/login-action@v3
with:
registry: nvcr.io
username: $oauthtoken
password: ${{ secrets.NVCR_TOKEN }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
Expand Down
137 changes: 137 additions & 0 deletions .github/workflows/_test_nccl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
name: ~run NCCL tests

on:
  workflow_call:
    inputs:
      # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda
      # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought
      # to be modified to test one of the JAX-Toolbox containers.
      CONTAINER:
        type: string
        description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04
        required: true

permissions:
  actions: write   # to cancel previous workflows
  contents: read   # to fetch code
  packages: write  # to upload container

jobs:
  # Rebuild the input container into an image that is compatible with the
  # Kubernetes MPI operator used to launch the multi-node NCCL tests below.
  build-mpi-operator-compatible-base:
    uses: ./.github/workflows/_build.yaml
    with:
      ARCHITECTURE: amd64
      ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
      BADGE_FILENAME: badge-mpi-operator-compatible-base-build
      BUILD_DATE: 0000-00-00  # not important; this image is never published
      BASE_IMAGE: ${{ inputs.CONTAINER }}
      CONTAINER_NAME: mpi-operator-compatible-base
      DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
      RUNNER_SIZE: small
    secrets: inherit
  # Run one MPIJob on the EKS cluster per NCCL collective being tested.
  nccl-test:
    needs: build-mpi-operator-compatible-base
    strategy:
      matrix:
        test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
    runs-on: eks
    env:
      BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
      TEST_NAME: ${{ matrix.test }}
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Login to NVIDIA Container Registry
        uses: docker/login-action@v3
        with:
          registry: nvcr.io
          # NGC's username is the literal string $oauthtoken; quote it so it
          # cannot be mistaken for (or ever expanded as) a variable reference.
          username: '$oauthtoken'
          password: ${{ secrets.NVCR_TOKEN }}
      - name: Store GitHub Container Registry token as Kubernetes secret
        run: |
          # Replace underscores in TEST_NAME with - to make a valid Kubernetes name
          JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
          LAUNCHER_NAME="${JOB_NAME}-launcher"
          TOKEN_NAME="${JOB_NAME}-token"
          # Make these available to later steps
          echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
          echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
          kubectl create secret generic \
            ${TOKEN_NAME} \
            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
            --type=kubernetes.io/dockerconfigjson
      - name: Configure Kubernetes job
        run: |
          export WORKER_NAME="${JOB_NAME}-worker"
          # Patch the checked-in MPIJob manifest in place with the per-run names,
          # image and test binary, then show the result for debuggability.
          yq -i '.metadata.name = strenv(JOB_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
            .github/eks-workflow-files/mpi-nccl-test.yml
          git diff .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Submit Kubernetes job
        run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Wait for Kubernetes job to start
        # Note that this is *not* using JOB_NAME
        run: |
          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
          # resources are available, but that is where there can be a long wait if the
          # cluster is busy executing other jobs.
          kubectl wait --for=create job/${LAUNCHER_NAME}
          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
      - name: Stream Kubernetes job output
        # Note that this is *not* JOB_NAME
        run: |
          # Streaming logs will fail if the container/pod is still pending
          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
            sleep 1
          done
          # TODO: --all-containers=true --all-pods=true could make sense here, but it
          # prefixes lines with a rather verbose tag
          kubectl logs --follow job/${LAUNCHER_NAME}
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          # Poll .status.failed/.status.succeeded until exactly one launcher pod
          # has terminated, then propagate the failure count as the exit code.
          while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            # Use -lt/-eq: inside [[ ]] the bare < and == operators compare
            # strings (lexicographically / as patterns), not numbers.
            if [[ ${total} -lt 1 ]]; then
              sleep 1
            elif [[ ${total} -eq 1 ]]; then
              break
            else
              # Shouldn't happen, maybe a sign the job being monitored does not have a
              # single launcher pod?
              exit 255
            fi
          done
          exit ${failure}
      # Provide more debug output in case of failure; note that some kinds of launch
      # failure do not produce any log output.
      - name: Debug failed Kubernetes job
        if: failure()
        run: |
          # Provide better debug in case of launch failures that will not produce log output
          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
          if [[ -n "${pods}" ]]; then
            kubectl describe ${pods}
          fi
      # Clean up in case of errors as well as success
      - name: Delete Kubernetes job
        if: always()
        # --ignore-not-found avoids a spurious step failure when an earlier step
        # failed before the job was ever submitted
        run: kubectl delete --ignore-not-found -f .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Delete GitHub Container Registry token
        if: always()
        run: |
          # TOKEN_NAME is unset if the secret-creation step never ran
          if [[ -n "${TOKEN_NAME:-}" ]]; then
            kubectl delete secret ${TOKEN_NAME} --ignore-not-found
          fi
7 changes: 7 additions & 0 deletions .github/workflows/_test_unit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Login to NVIDIA Container Registry
uses: docker/login-action@v3
with:
registry: nvcr.io
username: $oauthtoken
password: ${{ secrets.NVCR_TOKEN }}

- name: Run tests
shell: bash -x -e {0}
continue-on-error: true
Expand Down
125 changes: 10 additions & 115 deletions .github/workflows/nccl-k8s.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,129 +15,24 @@ on:
# Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda
# images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought
# to be modified to test one of the JAX-Toolbox containers.
CUDA_IMAGE:
CONTAINER:
type: string
description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04
description: Container to test, this is assumed to already contain the NCCL tests e.g. cuda-dl-base or derived
default: ''
required: false

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

permissions:
actions: write # to cancel previous workflows
contents: read # to fetch code
actions: write # to cancel previous workflows
contents: read # to fetch code
packages: write # to upload container

jobs:
build-mpi-operator-compatible-base:
uses: ./.github/workflows/_build.yaml
nccl-tests:
uses: ./.github/workflows/_test_nccl.yaml
with:
ARCHITECTURE: amd64
ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
BADGE_FILENAME: badge-mpi-operator-compatible-base-build
BUILD_DATE: 0000-00-00 # not important; this image is never published
BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04' }}
CONTAINER_NAME: mpi-operator-compatible-base
DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
RUNNER_SIZE: small
CONTAINER: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }}
secrets: inherit
# TODO: expand beyond all-reduce
nccl-test:
needs: build-mpi-operator-compatible-base
strategy:
matrix:
test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
runs-on: eks
env:
BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
TEST_NAME: ${{ matrix.test }}
steps:
- name: Check out the repository
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Store GitHub Container Registry token as Kubernetes secret
run: |
# Replace underscores in TEST_NAME with - to make a valid Kubernetes name
JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
LAUNCHER_NAME="${JOB_NAME}-launcher"
TOKEN_NAME="${JOB_NAME}-token"
# Make these available to later steps
echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
kubectl create secret generic \
${TOKEN_NAME} \
--from-file=.dockerconfigjson=$HOME/.docker/config.json \
--type=kubernetes.io/dockerconfigjson
- name: Configure Kubernetes job
run: |
export WORKER_NAME="${JOB_NAME}-worker"
yq -i '.metadata.name = strenv(JOB_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
| .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
.github/eks-workflow-files/mpi-nccl-test.yml
git diff .github/eks-workflow-files/mpi-nccl-test.yml
- name: Submit Kubernetes job
run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
- name: Wait for Kubernetes job to start
# Note that this is *not* using JOB_NAME
run: |
# Launcher job is created eagerly, but suspended. Kueue un-suspends it when
# resources are available, but that is where there can be a long wait if the
# cluster is busy executing other jobs.
kubectl wait --for=create job/${LAUNCHER_NAME}
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
- name: Stream Kubernetes job output
# Note that this is *not* JOB_NAME
run: |
# Streaming logs will fail if the container/pod is still pending
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
sleep 1
done
# TODO: --all-containers=true --all-pods=true could make sense here, but it
# prefixes lines with a rather verbose tag
kubectl logs --follow job/${LAUNCHER_NAME}
- name: Retrieve Kubernetes job status
shell: bash -exo pipefail {0}
run: |
while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
failure=${status[0]:-0}
success=${status[1]:-0}
total=$((failure+success))
if [[ ${total} < 1 ]]; then
sleep 1
elif [[ ${total} == 1 ]]; then
break
else
# Shouldn't happen, maybe a sign the job being monitored does not have a
# single launcher pod?
exit 255
fi
done
exit ${failure}
# Provide more debug output in case of failure; note that some kinds of launch
# failure do not produce any log output.
- name: Debug failed Kubernetes job
if: failure()
run: |
# Provide better debug in case of launch failures that will not produce log output
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
if [[ -n "${pods}" ]]; then
kubectl describe ${pods}
fi
# Clean up in case of errors as well as success
- name: Delete Kubernetes job
if: always()
run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml
- name: Delete GitHub Container Registry token
if: always()
run: kubectl delete secret ${TOKEN_NAME}
15 changes: 11 additions & 4 deletions .github/workflows/ngc-release-testing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: ~NGC release testing
on:
workflow_dispatch:
inputs:
JAX_IMAGE:
JAX_IMAGE:
type: string
description: "JAX image to run tests on"
required: false
Expand All @@ -26,9 +26,16 @@ concurrency:
permissions:
contents: read # to fetch code
actions: write # to cancel previous workflows
packages: read # to upload container
packages: write # to upload container

jobs:
test-nccl:
if: inputs.JAX_IMAGE != ''
uses: ./.github/workflows/_test_nccl.yaml
with:
CONTAINER: ${{ inputs.JAX_IMAGE }}
secrets: inherit

test-jax:
if: inputs.JAX_IMAGE != ''
uses: ./.github/workflows/_test_unit.yaml
Expand Down Expand Up @@ -65,7 +72,7 @@ jobs:
with:
PAX_IMAGE: ${{ inputs.PAX_IMAGE }}
secrets: inherit

test-maxtext:
if: inputs.MAXTEXT_IMAGE != ''
uses: ./.github/workflows/_test_maxtext.yaml
Expand All @@ -74,7 +81,7 @@ jobs:
secrets: inherit

finalize:
needs: [ test-jax, test-rosetta-pax, test-maxtext ]
needs: [ test-nccl, test-jax, test-rosetta-pax, test-maxtext ]
if: "!cancelled()"
uses: ./.github/workflows/_finalize.yaml
secrets: inherit

0 comments on commit 2cab2b9

Please sign in to comment.