Skip to content

Commit

Permalink
Test NGC candidate MaxText container on AWS/K8s
Browse files Browse the repository at this point in the history
  • Loading branch information
olupton committed Jan 13, 2025
1 parent 2cab2b9 commit 481e71b
Show file tree
Hide file tree
Showing 3 changed files with 237 additions and 3 deletions.
120 changes: 120 additions & 0 deletions .github/eks-workflow-files/maxtext-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Headless Service for the MaxText Job defined in the next document.
# The Job's pod template sets `subdomain` to this Service's name, and the
# worker script addresses the coordinator as `<job-name>-0.<service-name>`.
# PLACEHOLDER values are substituted by the CI workflow before `kubectl apply`.
apiVersion: v1
kind: Service
metadata:
  name: PLACEHOLDER
spec:
  # clusterIP must be None to create a headless service
  clusterIP: "None"
  selector:
    # Must match the Job name
    job-name: PLACEHOLDER
---
# Indexed Job that runs the MaxText test on 2 nodes with 8 GPUs each.
# Each pod has two containers sharing the `output` volume:
#   * maxtext: launches 8 processes (one per GPU) and writes logs/profiles to
#     /opt/output, then creates /opt/output/.done
#   * upload:  waits for /opt/output/.done and copies /opt/output to S3
# PLACEHOLDER values are substituted by the CI workflow (via yq) before
# `kubectl apply`; the positions of the trailing `command` entries matter
# because the workflow addresses them by index.
apiVersion: batch/v1
kind: Job
metadata:
  name: PLACEHOLDER
  labels:
    kueue.x-k8s.io/queue-name: p5-queue
spec:
  completions: 2 # number of nodes
  parallelism: 2 # number of nodes
  completionMode: Indexed
  backoffLimitPerIndex: 0 # max failures per index
  maxFailedIndexes: 0 # all indices must succeed
  template:
    spec:
      subdomain: PLACEHOLDER # has to match Service name
      restartPolicy: Never
      imagePullSecrets:
        - name: PLACEHOLDER
      containers:
        - name: maxtext
          image: PLACEHOLDER
          ports:
            # Same port as JAX_COORDINATOR_PORT in the script below
            - containerPort: 3389
          command:
            - bash
            - -c
            # The logging logic: stream stdout/stderr from the 0th process inside this pod,
            # record all of the processes' stdout/stderr + the INFO-level NCCL logs to file
            - |
              # With `bash -c SCRIPT ARG0 ARG1`, the two trailing command
              # entries arrive as $0 (service name) and $1 (job name).
              export SERVICE_NAME=$0
              export JOB_NAME=$1
              # Quoted heredoc delimiter: nothing below is expanded until
              # each-process.sh itself runs.
              cat >each-process.sh <<'EOL'
              export JAX_COORDINATOR_IP=${JOB_NAME}-0.${SERVICE_NAME}
              export JAX_COORDINATOR_PORT=3389
              export NNODES=16 # actually #processes == #GPUs
              export NODE_RANK=$((JOB_COMPLETION_INDEX*8 + LOCAL_RANK))
              export JAX_LOCAL_DEVICE_IDS=$LOCAL_RANK
              export NCCL_DEBUG=INFO
              export NCCL_DEBUG_FILE=/opt/output/nccl.$NODE_RANK.log
              # Only local rank 0 streams to the pod's stdout; all ranks log to file
              [[ $LOCAL_RANK == 0 ]] && console="/dev/stdout" || console="/dev/null"
              nsys-jax \
                --capture-range=cudaProfilerApi \
                --capture-range-end=stop \
                -o /opt/output/profile.$NODE_RANK.zip \
                -- \
                test-maxtext.sh \
                  -n 2 \
                  -b 2 \
                  --model-name=llama2-7b \
                  --attn-type=cudnn_flash_te \
                  --remat-policy=minimal_flash \
                  --steps=20 \
                  --fsdp=16 \
                  -a "scan_layers=false \
                      max_target_length=4096 \
                      use_iota_embed=true \
                      logits_dot_in_fp32=false \
                      profiler=nsys \
                      skip_first_n_steps_for_profiler=3 \
                      profiler_steps=8" \
                |& tee /opt/output/output.$NODE_RANK.log >"${console}"
              # Take the exit status of nsys-jax, not of tee: without pipefail,
              # $? would be tee's status and a failed run would look successful.
              code=${PIPESTATUS[0]}
              # Should run even on failure
              cat /opt/output/nccl.$NODE_RANK.log >"${console}"
              exit $code
              EOL
              # TODO: upgrade parallel-launch to return a failure code as soon as any
              #       of its children do (it already does this eventually, but it could
              #       be slow)
              parallel-launch LOCAL_RANK 8 bash each-process.sh
              code=$?
              # Should run even on failure; signals the upload container to start
              touch /opt/output/.done
              exit $code
            - PLACEHOLDER
            - PLACEHOLDER
          resources:
            limits:
              nvidia.com/gpu: 8
              vpc.amazonaws.com/efa: 32
          volumeMounts:
            - mountPath: /dev/shm
              name: shmem
            - mountPath: /opt/output
              name: output
        - name: upload
          image: amazon/aws-cli
          command:
            - bash
            - -c
            - |
              # $0 is the trailing command entry (the job name)
              JOB_NAME="$0"
              # Block until the maxtext container has finished writing output
              while [[ ! -f /opt/output/.done ]]; do
                sleep 1
              done
              # Do not upload the marker file itself
              rm /opt/output/.done
              aws s3 cp \
                --recursive \
                /opt/output \
                "s3://jax-toolbox-eks-output/${JOB_NAME}/"
            - PLACEHOLDER
          volumeMounts:
            - mountPath: /opt/output
              name: output
      volumes:
        - name: output
          emptyDir: {}
        - name: shmem
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi
107 changes: 107 additions & 0 deletions .github/workflows/_test_maxtext_k8s.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Reusable workflow: runs the MaxText container as a 2-node Kubernetes Job
# (manifest: .github/eks-workflow-files/maxtext-job.yaml), streams its logs,
# and fails if any Job index failed. Cleanup steps always run.
name: ~test MaxText functionality on Kubernetes

on:
  workflow_call:
    inputs:
      MAXTEXT_IMAGE:
        type: string
        description: MaxText container to test
        required: true

permissions:
  contents: read # to fetch code

jobs:
  maxtext:
    runs-on: eks
    env:
      CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}"
      # Unique per run and per re-run attempt; also used to derive the
      # Service and secret names below
      JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}"
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Login to NVIDIA Container Registry
        uses: docker/login-action@v3
        with:
          registry: nvcr.io
          # Literal string: `with:` inputs are not shell-expanded
          username: '$oauthtoken'
          password: ${{ secrets.NVCR_TOKEN }}
      - name: Store GitHub Container Registry token as Kubernetes secret
        run: |
          # Make this available to later steps
          TOKEN_NAME="${JOB_NAME}-token"
          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
          # The docker config written by the login steps above becomes the
          # image pull secret referenced by the Job
          kubectl create secret generic \
            ${TOKEN_NAME} \
            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
            --type=kubernetes.io/dockerconfigjson
      - name: Configure Kubernetes job
        run: |
          export SERVICE_NAME="${JOB_NAME}-svc"
          # Document 0 is the Service, document 1 is the Job. The command[N]
          # indices refer to the PLACEHOLDER entries after the inline script.
          yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME)
            | select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
            | select(di == 1).metadata.name = strenv(JOB_NAME)
            | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
            | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE)
            | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME)
            | select(di == 1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME)
            | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \
            .github/eks-workflow-files/maxtext-job.yaml
          # Show the substituted manifest in the log
          git diff .github/eks-workflow-files/maxtext-job.yaml
      - name: Submit Kubernetes job
        run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml
      - name: Wait for Kubernetes job to start
        run: |
          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
          # resources are available, but that is where there can be a long wait if the
          # cluster is busy executing other jobs.
          kubectl wait --for=create job/${JOB_NAME}
          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s
      - name: Stream Kubernetes job output
        run: |
          # Streaming logs will fail if the container/pod is still pending
          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
            sleep 1
          done
          kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME}
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          # Poll until failed + succeeded accounts for both Job indices
          # (the manifest sets completions: 2). Either count may be empty
          # in the output, hence the :-0 defaults.
          while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            # Use -lt/-eq: inside [[ ]], < and == compare strings, not numbers
            if [[ ${total} -lt 2 ]]; then
              sleep 1
            elif [[ ${total} -eq 2 ]]; then
              break
            else
              # FIXME
              exit 255
            fi
          done
          # Non-zero exit (step failure) if any index failed
          exit ${failure}
      # Provide more debug output in case of failure; note that some kinds of launch
      # failure do not produce any log output.
      - name: Debug failed Kubernetes job
        if: failure()
        run: |
          # Provide better debug in case of launch failures that will not produce log output
          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name)
          if [[ -n "${pods}" ]]; then
            # Intentionally unquoted: one argument per pod
            kubectl describe ${pods}
          fi
      # Clean up in case of errors as well as success
      - name: Delete Kubernetes job
        if: always()
        # --ignore-not-found: an earlier step may have failed before the
        # resources were created; that should not add a second failure here
        run: kubectl delete --ignore-not-found -f .github/eks-workflow-files/maxtext-job.yaml
      - name: Delete GitHub Container Registry token
        if: always()
        run: kubectl delete secret --ignore-not-found ${TOKEN_NAME}
13 changes: 10 additions & 3 deletions .github/workflows/ngc-release-testing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
docker run -i --shm-size=1g --gpus all \
${{ inputs.JAX_IMAGE }} \
bash <<"EOF" |& tee test-backend-independent.log
test-jax.sh -b backend-independent
test-jax.sh -b backend-independent
EOF
docker run -i --shm-size=1g --gpus all \
${{ inputs.JAX_IMAGE }} \
Expand Down Expand Up @@ -80,8 +80,15 @@ jobs:
MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
secrets: inherit

# Calls the reusable Kubernetes MaxText test workflow; skipped when no
# MaxText image was supplied.
test-maxtext-eks:
  if: inputs.MAXTEXT_IMAGE != ''
  uses: ./.github/workflows/_test_maxtext_k8s.yaml
  with:
    MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
  secrets: inherit

# Runs after every test job listed in `needs`, whether they succeeded or
# failed, unless the workflow run was cancelled.
finalize:
  needs: [ test-nccl, test-jax, test-rosetta-pax, test-maxtext, test-maxtext-eks ]
  if: "!cancelled()"
  uses: ./.github/workflows/_finalize.yaml
  secrets: inherit

0 comments on commit 481e71b

Please sign in to comment.