From 7daba8d44753479aab38e3dd4494c93c2219ed5d Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Sun, 1 Sep 2024 17:14:07 -0700 Subject: [PATCH 01/10] add skaffold --- .gitignore | 1 + skaffold.yaml | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 skaffold.yaml diff --git a/.gitignore b/.gitignore index fccb76fd..4ef3dce6 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ go.work # Files that might be committed from running guides /helm-values.yaml +Chart.lock # Ignore python virtual env .venv diff --git a/skaffold.yaml b/skaffold.yaml new file mode 100644 index 00000000..65f83eee --- /dev/null +++ b/skaffold.yaml @@ -0,0 +1,23 @@ +apiVersion: skaffold/v2beta21 +kind: Config +metadata: + name: kubeai-project +build: + artifacts: + - image: substratusai/kubeai + # Additional build configuration (e.g., Dockerfile location) goes here. +deploy: + helm: + releases: + - name: kubeai + chartPath: charts/kubeai + valuesFiles: + - charts/kubeai/values.yaml + setValueTemplates: + image.tag: "{{.DIGEST_HEX}}" +portForward: +- resourceType: service + resourceName: kubeai + namespace: default + port: 80 + localPort: 8000 \ No newline at end of file From 0ab6d8f9aa2ad0dce0739ea4cade3e3d4bca585c Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Sun, 1 Sep 2024 23:52:49 -0700 Subject: [PATCH 02/10] add e2e quickstart test --- .github/workflows/tests.yml | 22 +++++++++++ test/quickstart.sh | 79 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 test/quickstart.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f485308a..b3e00766 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,3 +21,25 @@ jobs: - name: Run unit tests run: make test + e2e-quickstart: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Install kind + run: | + curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.24.0/kind-linux-amd64 + chmod +x ./kind + sudo mv ./kind /usr/local/bin/kind + - name: Install helm + run: | + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 get_helm.sh + ./get_helm.sh + - name: Install skaffold + run: | + curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64 + chmod +x skaffold + sudo mv skaffold /usr/local/bin + - name: Run the quickstart tests + run: test/quickstart.sh diff --git a/test/quickstart.sh b/test/quickstart.sh new file mode 100644 index 00000000..d042077f --- /dev/null +++ b/test/quickstart.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +set -xe + +if kind get clusters | grep -q substratus-test; then + echo "Cluster substratus-tests already exists.. reusing it" + else + kind create cluster --name substratus-test +fi + +error_handler() { + local exit_status=$? # Capture the exit status of the last command + if [ $exit_status -ne 0 ]; then + echo "An error occurred. Exiting with status $exit_status. Leaving kind cluster intact for debugging" + elif [ "$TEST_CLEANUP" != "false" ]; then + echo "Exiting normally. Deleting kind cluster" + kind delete cluster --name=substratus-test + fi +} + +trap 'error_handler' ERR EXIT + + +# Capture PID and run skaffold devin background +skaffold dev & +skaffold_pid=$! + +# Get the helm release name +release_name=$(helm list -n default | grep substratus | awk '{print $1}') + +# wait for kubeai pod to be ready +while ! kubectl get pod -l app.kubernetes.io/name=kubeai | grep -q Running; do + sleep 5 + if (( SECONDS >= 300 )); then + echo "kubeai pod did not start in time" + exit 1 + fi +done +kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/name=kubeai \ + --timeout=300s + +# Ensure the model count is 0 +curl -s -X GET "http://localhost:8000/openai/v1/models" | jq '. | length == 0' + + +helm upgrade --reuse-values --install kubeai charts/kubeai -f - <= 600 )); then + echo "gemma 2 2b pod did not start in time" + exit 1 + fi +done +kubectl wait --for=condition=ready pod \ + -l model=gemma2-2b-cpu \ + --timeout=600s + +curl -s -X GET "http://localhost:8000/openai/v1/models" | jq '. | length == 3' + +curl http://localhost:8080/openai/v1/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gemma2-2b-cpu", "prompt": "Who was the first president of the United States?", "max_tokens": 40}' + +# Send exit signal to skaffold and wait for it to exit +kill "$skaffold_pid" +wait "$skaffold_pid" From 27f0700c34396712065097893ca1c5d28a9c7916 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Sun, 1 Sep 2024 23:53:43 -0700 Subject: [PATCH 03/10] make script executable --- test/quickstart.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 test/quickstart.sh diff --git a/test/quickstart.sh b/test/quickstart.sh old mode 100644 new mode 100755 From d6279cafdc2d6da81788872a0601d57edc527fb9 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Mon, 2 Sep 2024 00:01:13 -0700 Subject: [PATCH 04/10] fix test url and ensure always exit skaffold --- test/quickstart.sh | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/test/quickstart.sh b/test/quickstart.sh index d042077f..c99dc4ea 100755 --- a/test/quickstart.sh +++ b/test/quickstart.sh @@ -2,36 +2,40 @@ set -xe -if kind get clusters | grep -q substratus-test; then - echo "Cluster substratus-tests already exists.. reusing it" +KIND_CLUSTER_NAME=${KIND_CLUSTER_NAME:-kubeai-tests} + +if kind get clusters | grep -q ${KIND_CLUSTER_NAME}; then + echo "Cluster ${KIND_CLUSTER_NAME} already exists.. reusing it" else - kind create cluster --name substratus-test + kind create cluster --name ${KIND_CLUSTER_NAME} fi +# Capture PID and run skaffold devin background +skaffold dev & +skaffold_pid=$! + error_handler() { local exit_status=$? # Capture the exit status of the last command if [ $exit_status -ne 0 ]; then echo "An error occurred. Exiting with status $exit_status. Leaving kind cluster intact for debugging" elif [ "$TEST_CLEANUP" != "false" ]; then echo "Exiting normally. Deleting kind cluster" - kind delete cluster --name=substratus-test + kind delete cluster --name=${KIND_CLUSTER_NAME} fi + # Send exit signal to skaffold and wait for it to exit + kill "$skaffold_pid" + wait "$skaffold_pid" } trap 'error_handler' ERR EXIT - -# Capture PID and run skaffold devin background -skaffold dev & -skaffold_pid=$! - # Get the helm release name release_name=$(helm list -n default | grep substratus | awk '{print $1}') # wait for kubeai pod to be ready while ! kubectl get pod -l app.kubernetes.io/name=kubeai | grep -q Running; do sleep 5 - if (( SECONDS >= 300 )); then + if (( SECONDS >= 600 )); then echo "kubeai pod did not start in time" exit 1 fi @@ -70,10 +74,7 @@ kubectl wait --for=condition=ready pod \ curl -s -X GET "http://localhost:8000/openai/v1/models" | jq '. | length == 3' -curl http://localhost:8080/openai/v1/completions \ +curl http://localhost:8000/openai/v1/completions \ -H "Content-Type: application/json" \ -d '{"model": "gemma2-2b-cpu", "prompt": "Who was the first president of the United States?", "max_tokens": 40}' -# Send exit signal to skaffold and wait for it to exit -kill "$skaffold_pid" -wait "$skaffold_pid" From 7973ee3ff69ce55dbcddc603f04beaa43afc0579 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Mon, 2 Sep 2024 00:05:34 -0700 Subject: [PATCH 05/10] increase timeout of gemma 2b to 900s --- test/quickstart.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/quickstart.sh b/test/quickstart.sh index c99dc4ea..8b2c0ae5 100755 --- a/test/quickstart.sh +++ b/test/quickstart.sh @@ -35,7 +35,7 @@ release_name=$(helm list -n default | grep substratus | awk '{print $1}') # wait for kubeai pod to be ready while ! kubectl get pod -l app.kubernetes.io/name=kubeai | grep -q Running; do sleep 5 - if (( SECONDS >= 600 )); then + if (( SECONDS >= 900 )); then echo "kubeai pod did not start in time" exit 1 fi From 063166951e781779269ecab0d0cb9acb4b4f7540 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Mon, 2 Sep 2024 00:13:16 -0700 Subject: [PATCH 06/10] create function wait_for_pod_ready --- skaffold.yaml | 1 - test/quickstart.sh | 41 +++++++++++++++++------------------------ 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/skaffold.yaml b/skaffold.yaml index 65f83eee..b0d80b5d 100644 --- a/skaffold.yaml +++ b/skaffold.yaml @@ -5,7 +5,6 @@ metadata: build: artifacts: - image: substratusai/kubeai - # Additional build configuration (e.g., Dockerfile location) goes here. deploy: helm: releases: diff --git a/test/quickstart.sh b/test/quickstart.sh index 8b2c0ae5..c147a53a 100755 --- a/test/quickstart.sh +++ b/test/quickstart.sh @@ -11,7 +11,7 @@ if kind get clusters | grep -q ${KIND_CLUSTER_NAME}; then fi # Capture PID and run skaffold devin background -skaffold dev & +skaffold run --tail --port-forward & skaffold_pid=$! error_handler() { @@ -29,20 +29,23 @@ error_handler() { trap 'error_handler' ERR EXIT -# Get the helm release name -release_name=$(helm list -n default | grep substratus | awk '{print $1}') +function wait_for_pod_ready() { + local label="$1" + local start_time=$SECONDS + + while ! kubectl get pod -l "$label" | grep -q Running; do + sleep 5 + if (( SECONDS - start_time >= 300 )); then + echo "Pods with label $label did not start in time." + exit 1 + fi + done + + kubectl wait --for=condition=ready pod -l "$label" --timeout=600s +} # wait for kubeai pod to be ready -while ! kubectl get pod -l app.kubernetes.io/name=kubeai | grep -q Running; do - sleep 5 - if (( SECONDS >= 900 )); then - echo "kubeai pod did not start in time" - exit 1 - fi -done -kubectl wait --for=condition=ready pod \ - -l app.kubernetes.io/name=kubeai \ - --timeout=300s +wait_for_pod_ready app.kubernetes.io/name=kubeai # Ensure the model count is 0 curl -s -X GET "http://localhost:8000/openai/v1/models" | jq '. | length == 0' @@ -60,17 +63,7 @@ models: enabled: true EOF - -while ! kubectl get pod -l model=gemma2-2b-cpu | grep -q Running; do - sleep 5 - if (( SECONDS >= 600 )); then - echo "gemma 2 2b pod did not start in time" - exit 1 - fi -done -kubectl wait --for=condition=ready pod \ - -l model=gemma2-2b-cpu \ - --timeout=600s +wait_for_pod_ready model=gemma2-2b-cpu curl -s -X GET "http://localhost:8000/openai/v1/models" | jq '. | length == 3' From 92a2e88d3d62e6390a59b3fb815144b6d19ade0a Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Mon, 2 Sep 2024 00:30:17 -0700 Subject: [PATCH 07/10] increase timeout to 1200s --- test/quickstart.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/quickstart.sh b/test/quickstart.sh index c147a53a..99b508ee 100755 --- a/test/quickstart.sh +++ b/test/quickstart.sh @@ -41,7 +41,7 @@ function wait_for_pod_ready() { fi done - kubectl wait --for=condition=ready pod -l "$label" --timeout=600s + kubectl wait --for=condition=ready pod -l "$label" --timeout=1200s } # wait for kubeai pod to be ready From f00c042abdb23e365bac760f3cbbeacbffd093c4 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Mon, 2 Sep 2024 08:47:43 -0700 Subject: [PATCH 08/10] explicitely set push to false --- skaffold.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skaffold.yaml b/skaffold.yaml index b0d80b5d..4ada207e 100644 --- a/skaffold.yaml +++ b/skaffold.yaml @@ -5,6 +5,8 @@ metadata: build: artifacts: - image: substratusai/kubeai + local: + push: false deploy: helm: releases: From 2aa667fb653055a7cfe40c06d421fd3d64b5e278 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Mon, 2 Sep 2024 09:17:39 -0700 Subject: [PATCH 09/10] update comments --- test/quickstart.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/quickstart.sh b/test/quickstart.sh index 99b508ee..f812806d 100755 --- a/test/quickstart.sh +++ b/test/quickstart.sh @@ -10,7 +10,7 @@ if kind get clusters | grep -q ${KIND_CLUSTER_NAME}; then kind create cluster --name ${KIND_CLUSTER_NAME} fi -# Capture PID and run skaffold devin background +# Capture PID and run skaffold in background. skaffold run --tail --port-forward & skaffold_pid=$! @@ -22,7 +22,7 @@ error_handler() { echo "Exiting normally. Deleting kind cluster" kind delete cluster --name=${KIND_CLUSTER_NAME} fi - # Send exit signal to skaffold and wait for it to exit + # Send exit signal to skaffold and wait for it to exit. kill "$skaffold_pid" wait "$skaffold_pid" } @@ -44,13 +44,13 @@ function wait_for_pod_ready() { kubectl wait --for=condition=ready pod -l "$label" --timeout=1200s } -# wait for kubeai pod to be ready +# wait for kubeai pod to be ready. wait_for_pod_ready app.kubernetes.io/name=kubeai -# Ensure the model count is 0 +# Ensure the model count is 0. curl -s -X GET "http://localhost:8000/openai/v1/models" | jq '. | length == 0' - +# Reuse values is needed to ensure the skaffold build image is used. helm upgrade --reuse-values --install kubeai charts/kubeai -f - < Date: Mon, 2 Sep 2024 09:25:52 -0700 Subject: [PATCH 10/10] update comments --- test/quickstart.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/quickstart.sh b/test/quickstart.sh index f812806d..efcff641 100755 --- a/test/quickstart.sh +++ b/test/quickstart.sh @@ -50,7 +50,9 @@ wait_for_pod_ready app.kubernetes.io/name=kubeai # Ensure the model count is 0. curl -s -X GET "http://localhost:8000/openai/v1/models" | jq '. | length == 0' -# Reuse values is needed to ensure the skaffold build image is used. +# By using the --reuse-values flag we can just append models to the previous install +# while avoiding overriding the image that skaffold originally built and set in the +# first install. helm upgrade --reuse-values --install kubeai charts/kubeai -f - <