Skip to content

Commit

Permalink
Add workflow_dispatch on cluster-create-and-delete test.
Browse files Browse the repository at this point in the history
  • Loading branch information
mbobrovskyi committed Dec 17, 2024
1 parent a24a0f8 commit 80c72a5
Showing 1 changed file with 19 additions and 9 deletions.
28 changes: 19 additions & 9 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,23 @@
name: Build Tests

on:
workflow_dispatch:
inputs:
tpuType:
description: 'TPU Type'
required: true
default: 'v4-8'
type: choice
options:
- v4-8
push:
branches: ["main"]
pull_request: # By default this runs for types assigned, opened and synchronize.

env:
# Names must be unique in parallel running tests.
TPU_CLUSTER_NAME: build-xpk-2-v4-8-nodepools
TPU_TYPE: ${{ inputs.tpuType || 'v4-8' }}
TPU_CLUSTER_NAME: build-xpk-2-nodepools
WORKLOAD_NAME: xpktest-build-${{ github.run_attempt }}
PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }}
CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
Expand All @@ -46,7 +56,7 @@ jobs:
run : make install-dev
- name: Run unit tests
run: make run-unittests

run-integration-tests:
runs-on: [ubuntu-22.04]
needs: [run-unit-tests]
Expand Down Expand Up @@ -92,8 +102,8 @@ jobs:
- name: Change RUN_ID env var if merge to main
run: echo "RUN_ID=main" >> $GITHUB_ENV
if: ${{ github.ref == 'refs/heads/main' }}
- name: Update cluster name with RUN_ID
run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$RUN_ID" >> $GITHUB_ENV
- name: Update cluster name with TPU_TYPE and RUN_ID
run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$TPU_TYPE-$RUN_ID" >> $GITHUB_ENV
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
Expand All @@ -119,28 +129,28 @@ jobs:
echo $PWD/bin >> "$GITHUB_PATH"
- name: Check xpk installation
run: xpk --help
- name: Create a private Pathways-enabled XPK Cluster with 2x v4-8 nodepools. Larger num-nodes to avoid master resizing.
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
- name: Create a private Pathways-enabled XPK Cluster with 2x $TPU_TYPE nodepools. Larger num-nodes to avoid master resizing.
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
- name: Verify the created cluster is private
run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a base-docker-image workload
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
- name: Run xpk inspector with the workload created above
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
- name: Wait for workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
- name: Wait for Pathways workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
- name: List out the workloads on the cluster
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Run xpk info
run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep -P '^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*2xv4-8:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)' || (echo 'Invalid command output' && cat output.txt && exit 1)
run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*2x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1)
- name: Delete the workload on the cluster
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Delete the Pathways workload on the cluster
Expand Down

0 comments on commit 80c72a5

Please sign in to comment.