Skip to content

Commit

Permalink
Test
Browse files Browse the repository at this point in the history
  • Loading branch information
mbobrovskyi authored and IrvingMg committed Jan 8, 2025
1 parent 5ee9555 commit b3134c4
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 86 deletions.
160 changes: 83 additions & 77 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ on:
options:
- v4-8
- v5litepod-8
push:
branches: ["main"]
pull_request: # By default this runs for activity types opened, synchronize and reopened.
# push:
# branches: ["main"]
# pull_request: # By default this runs for activity types opened, synchronize and reopened.

env:
# Names must be unique in parallel running tests.
Expand All @@ -36,77 +36,83 @@ env:
WORKLOAD_NAME: xpktest-build-${{ github.run_attempt }}
PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }}
CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
RUN_ID: "pr-${{ github.event.number }}"
PROJECT_ID: ${{secrets.PROJECT_NAME}}
A3_MEGA_TEST_CLUSTER_NAME: "xpk-mega-ctk-int"
A3_ULTRA_TEST_CLUSTER_NAME: "xpk-ultra-ctk-int"
GKE_ML_TEST_CLUSTER_NAME: "xpk-gke-ml"
ZONE: us-central2-a
REGION: us-central2
ZONE: europe-west4-b
REGION: europe-west4

jobs:
run-unit-tests:
runs-on: [ubuntu-22.04]
concurrency: # We support one build or nightly test to run at a time currently.
group: build-test-cluster-group
cancel-in-progress: false
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install dependencies
run : make install-dev
- name: Run unit tests
run: make run-unittests

run-integration-tests:
runs-on: [ubuntu-22.04]
needs: [run-unit-tests]
concurrency: # We support one build or nightly test to run at a time currently.
group: build-test-cluster-group
cancel-in-progress: false
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- uses: 'google-github-actions/auth@v2'
with:
credentials_json: '${{ secrets.GCP_SA_KEY }}'
- uses: google-github-actions/setup-gcloud@v2
with:
version: '>= 363.0.0'
install_components: 'beta,gke-gcloud-auth-plugin'
- name: Verify gcp setup
run: gcloud info
- name: Install dependencies
run : make install-dev
- name: "Set auth cidr"
run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV
- name: "Set GCLOUD_CFG_PATH"
run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV
- name: "Copy credentials"
run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json
- name: "Set DEPLOYMENT_DIR"
run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV
- name: Create deployment dir
run: mkdir -p $DEPLOYMENT_DIR
- name: Run integration tests
run: make run-integrationtests
# run-unit-tests:
# runs-on: [ubuntu-22.04]
# concurrency: # We support one build or nightly test to run at a time currently.
# group: build-test-cluster-group
# cancel-in-progress: false
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - name: Install dependencies
# run : make install-dev
# - name: Run unit tests
# run: make run-unittests
#
# run-integration-tests:
# runs-on: [ubuntu-22.04]
# needs: [run-unit-tests]
# concurrency: # We support one build or nightly test to run at a time currently.
# group: build-test-cluster-group
# cancel-in-progress: false
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - uses: 'google-github-actions/auth@v2'
# with:
# credentials_json: '${{ secrets.GCP_SA_KEY }}'
# - uses: google-github-actions/setup-gcloud@v2
# with:
# version: '>= 363.0.0'
# install_components: 'beta,gke-gcloud-auth-plugin'
# - name: Verify gcp setup
# run: gcloud info
# - name: Install dependencies
# run : make install-dev
# - name: "Set auth cidr"
# run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV
# - name: "Set GCLOUD_CFG_PATH"
# run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV
# - name: "Copy credentials"
# run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json
# - name: "Set DEPLOYMENT_DIR"
# run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV
# - name: Create deployment dir
# run: mkdir -p $DEPLOYMENT_DIR
# - name: Run integration tests
# run: make run-integrationtests

cluster-create-and-delete:
runs-on: [ubuntu-22.04]
needs: [run-integration-tests]
# needs: [run-integration-tests]
concurrency: # We support one nightly test and one build test for each branch to run at a time currently.
group: build-test-cluster-group-${{ github.ref }}
cancel-in-progress: false
steps:
- name: Change RUN_ID env var if merge to main
run: echo "RUN_ID=main" >> $GITHUB_ENV
if: ${{ github.ref == 'refs/heads/main' }}
- name: Initialize RUN_ID env var
run: |
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
RUN_ID="dispatch"
elif [ "${{ github.ref }}" == "refs/heads/main" ]; then
RUN_ID="main"
else
RUN_ID="pr-${{ github.event.number }}"
fi
echo "RUN_ID=$RUN_ID" >> $GITHUB_ENV
- name: Update cluster name with TPU_TYPE and RUN_ID
run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$TPU_TYPE-$RUN_ID" >> $GITHUB_ENV
run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$RUN_ID" >> $GITHUB_ENV
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
Expand All @@ -133,40 +139,40 @@ jobs:
- name: Check xpk installation
run: xpk --help
- name: Create a private Pathways-enabled XPK Cluster with 2x $TPU_TYPE nodepools. Larger num-nodes to avoid master resizing.
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
- name: Verify the created cluster is private
run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=1 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}'
# - name: Verify the created cluster is private
# run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=europe-west4 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a base-docker-image workload
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b
- name: Run xpk inspector with the workload created above
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --workload $WORKLOAD_NAME
- name: Wait for workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=1 --zone=europe-west4-b --command "echo \"Hello world from a test script! \""
- name: Wait for Pathways workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
- name: List out the workloads on the cluster
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Run xpk info
run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*2x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1)
run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | tee output.txt | grep -P "^(?=.*QUEUE)(?=.*PENDING_WORKLOADS)(?=.*ADMITTED_WORKLOADS)(?=.*2x$TPU_TYPE:google.com/tpu)(?=.*cpu-rm:cpu)(?=.*cpu-rm:memory)(?=.*cpu-proxy:cpu)(?=.*cpu-proxy:memory)(?=.*cpu-user:cpu)(?=.*cpu-user:memory)" || (echo 'Invalid command output' && cat output.txt && exit 1)
- name: Delete the workload on the cluster
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Delete the Pathways workload on the cluster
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b
- name: Create test script to execute in batch
run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
- name: Run a batch job on the cluster
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3 --time 60
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3 --time 60
- name: List out the jobs on the cluster
run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-'
- name: Get created job name
run: |
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
- name: Check created job
run: |
Expand All @@ -179,7 +185,7 @@ jobs:
run: python3 xpk.py job info ${JOB_NAME} | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
- name: Cancel the batch job on the cluster
run: |
python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"
python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b | grep "job.batch/${JOB_NAME} deleted"
- name: Create shell and exit it immediately
run: |
cat <<'EOF' >> create-shell.exp
Expand All @@ -197,7 +203,7 @@ jobs:
run: python3 xpk.py shell stop
- name: Delete the cluster created
if: always()
run: echo 'y' | python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=europe-west4-b --force



Expand Down
9 changes: 5 additions & 4 deletions .github/workflows/lint_and_format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
name: Lint and Format

on:
pull_request:
push:
branches:
- main
workflow_dispatch:
# pull_request:
# push:
# branches:
# - main

jobs:
build-and-test:
Expand Down
1 change: 1 addition & 0 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,7 @@ def run_gke_cluster_create_command(
f' --num-nodes {args.default_pool_cpu_num_nodes}'
f' {args.custom_cluster_arguments}'
f' {rapid_release_cmd}'
' --verbosity=debug'
)

enable_ip_alias = False
Expand Down
2 changes: 1 addition & 1 deletion src/xpk/core/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
children.append(
# subprocess managed by list pylint: disable=consider-using-with
subprocess.Popen(
command, stdout=output_logs[i], stderr=output_logs[i], shell=True
command, stdout=sys.stdout, stderr=sys.stderr, shell=True
)
)

Expand Down
4 changes: 2 additions & 2 deletions src/xpk/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,7 +1277,7 @@ def run_gke_node_pool_create_command(
if node_pool_name in node_pools_to_remain:
continue
command = (
'gcloud beta container node-pools create'
'gcloud beta container node-pools create --verbosity=debug'
f' {node_pool_name}'
f' --region={zone_to_region(args.zone)}'
f' --cluster={args.cluster}'
Expand Down Expand Up @@ -1342,7 +1342,7 @@ def run_gke_node_pool_create_command(
if node_pool_name in existing_node_pool_names:
continue
command = (
'gcloud beta container node-pools create'
'gcloud beta container node-pools create --verbosity=debug'
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
' --min-nodes=1 --max-nodes=20'
Expand Down
4 changes: 2 additions & 2 deletions src/xpk/core/system_characteristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1107,9 +1107,9 @@ def get_system_characteristics_by_device_type(
# v5litepod
'v5litepod-8': SystemCharacteristics(
'2x4',
1,
2,
'tpu-v5-lite-podslice',
'ct5lp-hightpu-8t',
'ct5lp-hightpu-4t',
8,
AcceleratorType['TPU'],
'v5litepod-8',
Expand Down

0 comments on commit b3134c4

Please sign in to comment.