Merge branch 'main' into olupton/nsys-jax-python-opt

NVIDIA · Dec 16, 2024 · 61357e9 · 61357e9
2 parents d8056e0 + 0e9abfa
commit 61357e9
Show file tree

Hide file tree

Showing 7 changed files with 239 additions and 10 deletions.
diff --git a/.github/container/Dockerfile.mpi-operator-compatible-base b/.github/container/Dockerfile.mpi-operator-compatible-base
@@ -0,0 +1,12 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE} AS mealkit
+FROM mealkit AS final
+RUN apt-get update \
+    && apt-get install -y openssh-server \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /run/sshd
+# Disable strict host-key/mode checks, following the mpi-operator base image: https://github.com/kubeflow/mpi-operator/blob/c738a83b185b4bf3bf7e6eca9d4503653294c995/build/base/Dockerfile#L16
+RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \
+    && echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+    && sed -i "s/#\(StrictModes \).*/\1no/g" /etc/ssh/sshd_config
diff --git a/.github/container/build-jax.sh b/.github/container/build-jax.sh
@@ -185,6 +185,8 @@ case "${CPU_ARCH}" in
         ;;
     "arm64")
         export CC_OPT_FLAGS="-march=armv8-a"
+        # ARM ACL build issue introduced in PR#23225
+        BUILD_PARAM="${BUILD_PARAM} --disable_mkl_dnn"
         ;;
 esac
 

diff --git a/.github/eks-workflow-files/job.yml b/.github/eks-workflow-files/job.yml
@@ -11,6 +11,8 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: PLACEHOLDER
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue
 spec:
   completions: 2 # number of nodes
   parallelism: 2 # number of nodes

diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml
@@ -0,0 +1,80 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: PLACEHOLDER  # overwritten with JOB_NAME by the workflow's yq step
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue  # Kueue queue that admits (un-suspends) this job
+spec:
+  runPolicy:
+    cleanPodPolicy: Running
+    # surface errors directly to GitHub Actions, without Kubernetes-internal retries
+    backoffLimit: 0
+    # start suspended; Kueue un-suspends the job once resources are available
+    suspend: true
+  # 1 MPI rank per GPU (each Worker requests nvidia.com/gpu: 8)
+  slotsPerWorker: 8
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      # Without this the launcher pod will be deleted on failure, which makes it hard
+      # to provide useful diagnostics
+      restartPolicy: Never
+      template:
+        spec:
+          containers:
+            - image: PLACEHOLDER  # overwritten with BASE_IMAGE by the workflow
+              imagePullPolicy: IfNotPresent
+              name: PLACEHOLDER  # overwritten with LAUNCHER_NAME by the workflow
+              command:
+                - bash
+                - -c
+                - |
+                  # kueue breaks the WaitForWorkersReady policy that mpi-operator
+                  # nominally supports, so manually wait a while for a basic mpirun to
+                  # start working (i.e. for the workers to be ready) before doing
+                  # anything interesting, instead of relying on mpi-operator not to
+                  # start the launcher before it is expected to succeed. This issue
+                  # seems related: https://github.com/kubeflow/mpi-operator/pull/617
+                  limit=5m
+                  if ! timeout ${limit} sh -c "while ! mpirun --allow-run-as-root -N 1 hostname; do sleep 5; done"; then
+                    echo "Workers were still not reachable after ${limit}, exiting"
+                    exit 1
+                  fi
+                  mpirun --allow-run-as-root -np 16 -N 8 $0 \
+                    -b 8 \
+                    -e 16G \
+                    -f 2 \
+                    -g 1 \
+                    -c 1 \
+                    -n 100
+                - PLACEHOLDER  # command[3]: overwritten with TEST_NAME by the workflow; bash -c exposes it to the script above as $0
+              resources:
+                limits:
+                  cpu: 1
+          imagePullSecrets:
+            - name: PLACEHOLDER  # overwritten with TOKEN_NAME by the workflow
+    Worker:
+      replicas: 2  # 2 workers x 8 slots each = the 16 ranks the launcher passes to mpirun -np
+      template:
+        spec:
+          nodeSelector:
+            node.kubernetes.io/instance-type: "p5.48xlarge"  # only schedule workers on this instance type
+          containers:
+            - image: PLACEHOLDER  # overwritten with BASE_IMAGE by the workflow
+              imagePullPolicy: IfNotPresent
+              name: PLACEHOLDER  # overwritten with WORKER_NAME by the workflow
+              volumeMounts:
+                - name: shmem
+                  mountPath: /dev/shm  # backed by the shmem hostPath volume declared below
+              resources:
+                limits:
+                  nvidia.com/gpu: 8  # one GPU per MPI slot (slotsPerWorker: 8)
+                  hugepages-2Mi: 5120Mi
+                  vpc.amazonaws.com/efa: 32
+                  memory: 32000Mi
+          imagePullSecrets:
+            - name: PLACEHOLDER  # overwritten with TOKEN_NAME by the workflow
+          volumes:
+            - name: shmem
+              hostPath:
+                path: /dev/shm  # the node's own /dev/shm is exposed to the container
diff --git a/.github/eks-workflow-files/post-process-job.yml b/.github/eks-workflow-files/post-process-job.yml
@@ -32,10 +32,6 @@ spec:
             - pipefail
             - -c
             - nsys-jax-combine -o /opt/output/combined.zip /opt/output/*.zip --analysis communication
-          # FIXME: GPU not actually needed, but the test cluster doesn't have appropriate non-GPU nodes
-          resources:
-            limits:
-              nvidia.com/gpu: 1
           volumeMounts:
             - mountPath: /opt/output
               name: output

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
@@ -455,12 +455,6 @@ jobs:
     steps:
     - name: Check out the repository
       uses: actions/checkout@v4
-    - name: Install yq
-      run: |
-        mkdir local_bin/
-        curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
-        chmod 777 ./local_bin/yq
-        echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
     - name: Login to GitHub Container Registry
       uses: docker/login-action@v3
       with:

diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml
@@ -0,0 +1,143 @@
+name: NCCL on Kubernetes
+on:
+  schedule:
+    - cron: '30 8 * * *'  # every day at 08:30 (GitHub evaluates schedules in UTC)
+  pull_request:
+    types:
+      - opened
+      - reopened
+      - ready_for_review
+      - synchronize
+    paths-ignore:
+      - '**.md'  # do not trigger when only Markdown files changed
+  workflow_dispatch:
+    inputs:
+      # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda
+      # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought
+      # to be modified to test one of the JAX-Toolbox containers.
+      CUDA_IMAGE:
+        type: string
+        description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04
+        default: ''  # empty selects the fallback image named in the build job's BASE_IMAGE
+        required: false
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}  # one group per PR head branch; otherwise unique per run
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # never cancel in-progress runs on main
+permissions:
+  actions: write # to cancel previous workflows
+  contents: read # to fetch code
+  packages: write # to upload container
+jobs:
+  build-mpi-operator-compatible-base:  # builds the openssh-enabled base image via the reusable build workflow
+    uses: ./.github/workflows/_build.yaml
+    with:
+      ARCHITECTURE: amd64
+      ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
+      BADGE_FILENAME: badge-mpi-operator-compatible-base-build
+      BUILD_DATE: '0000-00-00'  # not important; this image is never published (quoted so no YAML loader reads it as a date)
+      BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04' }}  # fallback is used when the input is empty or absent
+      CONTAINER_NAME: mpi-operator-compatible-base
+      DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
+      RUNNER_SIZE: small
+    secrets: inherit
+  # Runs each NCCL test binary in the matrix as its own MPIJob. TODO: expand the set of tests covered
+  nccl-test:
+    needs: build-mpi-operator-compatible-base
+    strategy:
+      matrix:
+        test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
+    runs-on: eks  # the steps below call kubectl and yq directly, so the runner must provide both
+    env:
+      BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}  # image produced by the build job
+      TEST_NAME: ${{ matrix.test }}  # test binary name; passed to mpirun through the manifest
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Store GitHub Container Registry token as Kubernetes secret  # also exports the names later steps use
+        run: |
+          # Unique per run, attempt and test; underscores in TEST_NAME become - to make a valid Kubernetes name
+          JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
+          LAUNCHER_NAME="${JOB_NAME}-launcher"
+          TOKEN_NAME="${JOB_NAME}-token"
+          # Make these available to later steps
+          echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
+          echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
+          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
+          kubectl create secret generic \
+            "${TOKEN_NAME}" \
+            --from-file=.dockerconfigjson="${HOME}/.docker/config.json" \
+            --type=kubernetes.io/dockerconfigjson
+      - name: Configure Kubernetes job  # fills the PLACEHOLDER fields of the MPIJob manifest in place (yq -i)
+        run: |
+          export WORKER_NAME="${JOB_NAME}-worker"
+          yq -i '.metadata.name = strenv(JOB_NAME)
+            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
+            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
+            | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
+            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
+            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
+            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
+            | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
+            .github/eks-workflow-files/mpi-nccl-test.yml
+          git diff .github/eks-workflow-files/mpi-nccl-test.yml
+      - name: Submit Kubernetes job  # applies the manifest edited by the previous step
+        run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
+      - name: Wait for Kubernetes job to start
+        # Note that this waits on the launcher Job (LAUNCHER_NAME), *not* on the MPIJob (JOB_NAME)
+        run: |
+          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
+          # resources are available, but that is where there can be a long wait if the
+          # cluster is busy executing other jobs.
+          kubectl wait --for=create job/${LAUNCHER_NAME}
+          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
+      - name: Stream Kubernetes job output
+        # Note that pods and logs are looked up via the launcher Job (LAUNCHER_NAME), *not* JOB_NAME
+        run: |
+          # Streaming logs will fail if the container/pod is still pending
+          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
+            sleep 1
+          done
+          # TODO: --all-containers=true --all-pods=true could make sense here, but it
+          # prefixes lines with a rather verbose tag
+          kubectl logs --follow job/${LAUNCHER_NAME}
+      - name: Retrieve Kubernetes job status  # step exit code is the launcher job's failed-pod count (0 on success)
+        shell: bash -exo pipefail {0}
+        run: |
+          while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
+            failure=${status[0]:-0}  # an empty jsonpath field counts as 0
+            success=${status[1]:-0}
+            total=$((failure+success))
+            if [[ ${total} -lt 1 ]]; then  # numeric test; nothing has finished yet, so poll again
+              sleep 1
+            elif [[ ${total} -eq 1 ]]; then
+              break
+            else
+              # Shouldn't happen, maybe a sign the job being monitored does not have a
+              # single launcher pod?
+              exit 255
+            fi
+          done
+          exit ${failure}
+      # Provide more debug output in case of failure; note that some kinds of launch
+      # failure do not produce any log output.
+      - name: Debug failed Kubernetes job
+        if: failure()  # only runs when an earlier step in this job failed
+        run: |
+          # Provide better debug in case of launch failures that will not produce log output
+          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
+          if [[ -n "${pods}" ]]; then
+            kubectl describe ${pods}
+          fi
+      # Clean up in case of errors as well as success
+      - name: Delete Kubernetes job
+        if: always()
+        run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml  # same manifest file that was applied earlier
+      - name: Delete GitHub Container Registry token
+        if: always()  # clean up the secret regardless of the job outcome
+        run: kubectl delete secret ${TOKEN_NAME}  # the secret created by the "Store ... secret" step