Skip to content

Commit

Permalink
Merge branch 'main' into olupton/nsys-jax-python-opt
Browse files Browse the repository at this point in the history
  • Loading branch information
olupton authored Dec 16, 2024
2 parents d8056e0 + 0e9abfa commit 61357e9
Show file tree
Hide file tree
Showing 7 changed files with 239 additions and 10 deletions.
12 changes: 12 additions & 0 deletions .github/container/Dockerfile.mpi-operator-compatible-base
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Thin layer on top of BASE_IMAGE that installs an SSH server and relaxes the
# SSH client/server checks, following the Kubeflow mpi-operator base image
# linked below.
ARG BASE_IMAGE
FROM ${BASE_IMAGE} AS mealkit
FROM mealkit AS final
# Use apt-get for every step: the `apt` front-end warns that its CLI is not
# stable for scripted use. `mkdir -p` keeps the build working if /run/sshd
# already exists in the base image or is created by the package.
RUN apt-get update \
    && apt-get install -y openssh-server \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /run/sshd
# Turn off StrictHostKeyChecking, discard known hosts and turn off StrictModes.
# https://github.com/kubeflow/mpi-operator/blob/c738a83b185b4bf3bf7e6eca9d4503653294c995/build/base/Dockerfile#L16
RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i "s/#\(StrictModes \).*/\1no/g" /etc/ssh/sshd_config
2 changes: 2 additions & 0 deletions .github/container/build-jax.sh
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ case "${CPU_ARCH}" in
;;
"arm64")
export CC_OPT_FLAGS="-march=armv8-a"
# ARM ACL build issue introduced in PR#23225
BUILD_PARAM="${BUILD_PARAM} --disable_mkl_dnn"
;;
esac

Expand Down
2 changes: 2 additions & 0 deletions .github/eks-workflow-files/job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ apiVersion: batch/v1
kind: Job
metadata:
name: PLACEHOLDER
labels:
kueue.x-k8s.io/queue-name: p5-queue
spec:
completions: 2 # number of nodes
parallelism: 2 # number of nodes
Expand Down
80 changes: 80 additions & 0 deletions .github/eks-workflow-files/mpi-nccl-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# MPIJob template for the NCCL tests. Every PLACEHOLDER value is expected to be
# filled in by the workflow that submits this manifest.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: PLACEHOLDER
  labels:
    kueue.x-k8s.io/queue-name: p5-queue
spec:
  # One MPI rank for each GPU
  slotsPerWorker: 8
  runPolicy:
    # Created suspended; kueue un-suspends the job
    suspend: true
    # No Kubernetes-internal retries, so errors surface directly in GitHub Actions
    backoffLimit: 0
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      # With any other policy the launcher pod is deleted on failure, which
      # makes it hard to provide useful diagnostics
      restartPolicy: Never
      replicas: 1
      template:
        spec:
          imagePullSecrets:
            - name: PLACEHOLDER
          containers:
            - name: PLACEHOLDER
              image: PLACEHOLDER
              imagePullPolicy: IfNotPresent
              # Element 3 of this list (the entry after the script) becomes $0
              # inside the script
              command:
                - bash
                - -c
                - |
                  # kueue breaks the WaitForWorkersReady policy that mpi-operator
                  # nominally supports, so manually wait a while for a basic mpirun to
                  # start working (i.e. for the workers to be ready) before doing
                  # anything interesting, instead of relying on mpi-operator not to
                  # start the launcher before it is expected to succeed. This issue
                  # seems related: https://github.com/kubeflow/mpi-operator/pull/617
                  limit=5m
                  if ! timeout ${limit} sh -c "while ! mpirun --allow-run-as-root -N 1 hostname; do sleep 5; done"; then
                    echo "Workers were still not reachable after ${limit}, exiting"
                    exit 1
                  fi
                  mpirun --allow-run-as-root -np 16 -N 8 $0 \
                    -b 8 \
                    -e 16G \
                    -f 2 \
                    -g 1 \
                    -c 1 \
                    -n 100
                - PLACEHOLDER
              resources:
                limits:
                  cpu: 1
    Worker:
      replicas: 2
      template:
        spec:
          nodeSelector:
            node.kubernetes.io/instance-type: 'p5.48xlarge'
          imagePullSecrets:
            - name: PLACEHOLDER
          # The host's /dev/shm is mounted into the worker container
          volumes:
            - name: shmem
              hostPath:
                path: /dev/shm
          containers:
            - name: PLACEHOLDER
              image: PLACEHOLDER
              imagePullPolicy: IfNotPresent
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
              resources:
                limits:
                  nvidia.com/gpu: 8
                  vpc.amazonaws.com/efa: 32
                  hugepages-2Mi: 5120Mi
                  memory: 32000Mi
4 changes: 0 additions & 4 deletions .github/eks-workflow-files/post-process-job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@ spec:
- pipefail
- -c
- nsys-jax-combine -o /opt/output/combined.zip /opt/output/*.zip --analysis communication
# FIXME: GPU not actually needed, but the test cluster doesn't have appropriate non-GPU nodes
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- mountPath: /opt/output
name: output
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -455,12 +455,6 @@ jobs:
steps:
- name: Check out the repository
uses: actions/checkout@v4
- name: Install yq
run: |
mkdir local_bin/
curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
chmod 777 ./local_bin/yq
echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
Expand Down
143 changes: 143 additions & 0 deletions .github/workflows/nccl-k8s.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
name: NCCL on Kubernetes
on:
  # Daily scheduled run at 08:30
  schedule:
    - cron: "30 8 * * *"
  pull_request:
    types: [opened, reopened, ready_for_review, synchronize]
    paths-ignore:
      - "**.md"
  workflow_dispatch:
    inputs:
      # cuda-dl-base ships the NCCL tests, whereas the vanilla nvidia/cuda images
      # do not. Once JAX-Toolbox itself is based on cuda-dl-base, this workflow
      # ought to test one of the JAX-Toolbox containers instead.
      CUDA_IMAGE:
        description: 'CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04'
        type: string
        required: false
        default: ""
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
permissions:
  packages: write  # to upload container
  contents: read  # to fetch code
  actions: write  # to cancel previous workflows
jobs:
  # Build the image that both the launcher and the worker pods run.
  build-mpi-operator-compatible-base:
    uses: ./.github/workflows/_build.yaml
    with:
      ARCHITECTURE: amd64
      ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
      BADGE_FILENAME: badge-mpi-operator-compatible-base-build
      BUILD_DATE: '0000-00-00'  # not important; this image is never published
      BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04' }}
      CONTAINER_NAME: mpi-operator-compatible-base
      DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
      RUNNER_SIZE: small
    secrets: inherit
  # Run each NCCL test binary in the matrix as its own MPIJob.
  # TODO: expand beyond the collectives listed in the matrix
  nccl-test:
    needs: build-mpi-operator-compatible-base
    strategy:
      matrix:
        test:
          - all_gather_perf_mpi
          - all_reduce_perf_mpi
          - broadcast_perf_mpi
          - reduce_scatter_perf_mpi
    runs-on: eks
    env:
      BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
      TEST_NAME: ${{ matrix.test }}
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Store GitHub Container Registry token as Kubernetes secret
        run: |
          # Replace underscores in TEST_NAME with - to make a valid Kubernetes name
          JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
          LAUNCHER_NAME="${JOB_NAME}-launcher"
          TOKEN_NAME="${JOB_NAME}-token"
          # Make these available to later steps
          echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
          echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
          # The docker login above wrote the registry credentials to config.json
          kubectl create secret generic \
            "${TOKEN_NAME}" \
            --from-file=.dockerconfigjson="${HOME}/.docker/config.json" \
            --type=kubernetes.io/dockerconfigjson
      - name: Configure Kubernetes job
        run: |
          export WORKER_NAME="${JOB_NAME}-worker"
          # Fill in the PLACEHOLDER values; command[3] is the argument that the
          # launcher script sees as $0, i.e. the NCCL test binary to run
          yq -i '.metadata.name = strenv(JOB_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
            .github/eks-workflow-files/mpi-nccl-test.yml
          # Show the substitutions in the log
          git diff .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Submit Kubernetes job
        run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Wait for Kubernetes job to start
        # Note that this is *not* using JOB_NAME
        run: |
          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
          # resources are available, but that is where there can be a long wait if the
          # cluster is busy executing other jobs.
          kubectl wait --for=create job/${LAUNCHER_NAME}
          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
      - name: Stream Kubernetes job output
        # Note that this is *not* JOB_NAME
        run: |
          # Streaming logs will fail if the container/pod is still pending
          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
            sleep 1
          done
          # TODO: --all-containers=true --all-pods=true could make sense here, but it
          # prefixes lines with a rather verbose tag
          kubectl logs --follow job/${LAUNCHER_NAME}
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          # Poll the launcher job until exactly one pod has failed or succeeded,
          # then exit with the failure count (0 on success, 1 on failure).
          while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            # Fields that are unset in the job status come back empty; treat as 0
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            # Use -lt/-eq: inside [[ ]] the < and == operators compare strings
            if [[ ${total} -lt 1 ]]; then
              sleep 1
            elif [[ ${total} -eq 1 ]]; then
              break
            else
              # Shouldn't happen, maybe a sign the job being monitored does not have a
              # single launcher pod?
              exit 255
            fi
          done
          exit ${failure}
      # Provide more debug output in case of failure; note that some kinds of launch
      # failure do not produce any log output.
      - name: Debug failed Kubernetes job
        if: failure()
        run: |
          # Provide better debug in case of launch failures that will not produce log output
          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
          if [[ -n "${pods}" ]]; then
            # Deliberately unquoted so that each pod name is a separate argument
            kubectl describe ${pods}
          fi
      # Clean up in case of errors as well as success
      - name: Delete Kubernetes job
        if: always()
        run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Delete GitHub Container Registry token
        if: always()
        run: kubectl delete secret ${TOKEN_NAME}

0 comments on commit 61357e9

Please sign in to comment.