From fff7959fdd2044736d221fd9b92b881cb2683223 Mon Sep 17 00:00:00 2001
From: Luca Giorgi
Date: Fri, 10 May 2024 17:44:17 +0200
Subject: [PATCH 1/6] Add base test for vLLM and its metrics

Signed-off-by: Luca Giorgi
---
 .../Resources/CLI/ModelServing/llm.resource   |   5 +-
 .../Files/llm/vllm/download_model.yaml        |  74 +++++++++++
 .../tests/Resources/Files/llm/vllm/query.json |  13 ++
 .../llm/vllm/vllm-gpt2_inferenceservice.yaml  |  14 +++
 .../Files/llm/vllm/vllm_servingruntime.yaml   |  79 ++++++++++++
 .../426__model_serving_vllm_metrics.robot     | 119 ++++++++++++++++++
 6 files changed, 302 insertions(+), 2 deletions(-)
 create mode 100644 ods_ci/tests/Resources/Files/llm/vllm/download_model.yaml
 create mode 100644 ods_ci/tests/Resources/Files/llm/vllm/query.json
 create mode 100644 ods_ci/tests/Resources/Files/llm/vllm/vllm-gpt2_inferenceservice.yaml
 create mode 100644 ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml
 create mode 100644 ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot

diff --git a/ods_ci/tests/Resources/CLI/ModelServing/llm.resource b/ods_ci/tests/Resources/CLI/ModelServing/llm.resource
index 2b502357d..48d1c935e 100644
--- a/ods_ci/tests/Resources/CLI/ModelServing/llm.resource
+++ b/ods_ci/tests/Resources/CLI/ModelServing/llm.resource
@@ -775,8 +775,9 @@ Get KServe Default Deployment Mode From DSC
     RETURN    ${mode}
 
 Start Port-forwarding
-    [Arguments]    ${namespace}    ${pod_name}    ${process_alias}=llm-query-process
-    ${process}=    Start Process    oc -n ${namespace} port-forward pod/${pod_name} 8033:8033
+    [Arguments]    ${namespace}    ${pod_name}    ${process_alias}=llm-query-process    ${local_port}=8033
+    ...    ${remote_port}=8033
+    ${process}=    Start Process    oc -n ${namespace} port-forward pod/${pod_name} ${local_port}:${remote_port}
     ...    alias=${process_alias}    stderr=STDOUT    shell=yes
     Process Should Be Running    ${process}
     sleep    7s
diff --git a/ods_ci/tests/Resources/Files/llm/vllm/download_model.yaml b/ods_ci/tests/Resources/Files/llm/vllm/download_model.yaml
new file mode 100644
index 000000000..c7851289c
--- /dev/null
+++ b/ods_ci/tests/Resources/Files/llm/vllm/download_model.yaml
@@ -0,0 +1,74 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: vllm-gpt2
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vlmm-gpt2-claim
+  namespace: vllm-gpt2
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 10Gi
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: setup-gpt2-binary
+  namespace: vllm-gpt2
+  labels:
+    gpt-download-pod: 'true'
+spec:
+  volumes:
+    - name: model-volume
+      persistentVolumeClaim:
+        claimName: vlmm-gpt2-claim
+  restartPolicy: Never
+  initContainers:
+    - name: fix-volume-permissions
+      image: quay.io/quay/busybox:latest
+      imagePullPolicy: IfNotPresent
+      securityContext:
+        allowPrivilegeEscalation: true
+      resources:
+        requests:
+          memory: "64Mi"
+          cpu: "250m"
+          nvidia.com/gpu: "1"
+        limits:
+          memory: "128Mi"
+          cpu: "500m"
+          nvidia.com/gpu: "1"
+      command: ["sh"]
+      args: ["-c", "chown -R 1001:1001 /mnt/models"]
+      volumeMounts:
+        - mountPath: "/mnt/models/"
+          name: model-volume
+  containers:
+    - name: download-model
+      image: registry.access.redhat.com/ubi9/python-311:latest
+      imagePullPolicy: IfNotPresent
+      securityContext:
+        allowPrivilegeEscalation: true
+      resources:
+        requests:
+          memory: "1Gi"
+          cpu: "1"
+          nvidia.com/gpu: "1"
+        limits:
+          memory: "1Gi"
+          cpu: "1"
+          nvidia.com/gpu: "1"
+      command: ["sh"]
+      args: [ "-c", "pip install --upgrade pip && pip install --upgrade huggingface_hub && python3 -c 'from huggingface_hub import snapshot_download\nsnapshot_download(repo_id=\"gpt2\", local_dir=\"/mnt/models/gpt2\", local_dir_use_symlinks=False)'"]
+      volumeMounts:
+        - mountPath: "/mnt/models/"
+          name: model-volume
+      env:
+        - name: TRANSFORMERS_CACHE
+          value: /tmp
\ No newline at end of file
diff --git a/ods_ci/tests/Resources/Files/llm/vllm/query.json b/ods_ci/tests/Resources/Files/llm/vllm/query.json
new file mode 100644
index 000000000..156795eda
--- /dev/null
+++ b/ods_ci/tests/Resources/Files/llm/vllm/query.json
@@ -0,0 +1,13 @@
+{
+    "model": "gpt2",
+    "messages": [
+        {
+            "role": "system",
+            "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."
+        },
+        {
+            "role": "user",
+            "content": "Compose a poem that explains the concept of recursion in programming."
+ } + ] +} \ No newline at end of file diff --git a/ods_ci/tests/Resources/Files/llm/vllm/vllm-gpt2_inferenceservice.yaml b/ods_ci/tests/Resources/Files/llm/vllm/vllm-gpt2_inferenceservice.yaml new file mode 100644 index 000000000..6ef4bbda2 --- /dev/null +++ b/ods_ci/tests/Resources/Files/llm/vllm/vllm-gpt2_inferenceservice.yaml @@ -0,0 +1,14 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: vllm-gpt2-openai + namespace: vllm-gpt2 + labels: + modelmesh-enabled: "true" +spec: + predictor: + model: + runtime: kserve-vllm + modelFormat: + name: vLLM + storageUri: pvc://vlmm-gpt2-claim/ \ No newline at end of file diff --git a/ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml b/ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml new file mode 100644 index 000000000..4d02cdcd0 --- /dev/null +++ b/ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml @@ -0,0 +1,79 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + name: kserve-vllm + namespace: vllm-gpt2 +spec: + annotations: + sidecar.istio.io/inject: "true" + sidecar.istio.io/rewriteAppHTTPProbers: "true" + serving.knative.openshift.io/enablePassthrough: "true" + opendatahub.io/dashboard: "true" + openshift.io/display-name: "vLLLM Openai entry point" + serving.kserve.io/enable-prometheus-scraping: "false" + serving.kserve.io/enable-metric-aggregation: "true" + prometheus.io/port: '8080' + prometheus.io/path: "/metrics/" + multiModel: false + supportedModelFormats: + - name: vLLM + autoSelect: true + containers: + - name: kserve-container + #image: kserve/vllmserver:latest + image: quay.io/wxpe/tgis-vllm:release.74803b6 + startupProbe: + httpGet: + port: 8080 + path: /health + # Allow 12 minutes to start + failureThreshold: 24 + periodSeconds: 30 + readinessProbe: + httpGet: + port: 8080 + path: /health + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + httpGet: + port: 8080 + path: /health + periodSeconds: 100 + timeoutSeconds: 8 + terminationMessagePolicy: "FallbackToLogsOnError" + terminationGracePeriodSeconds: 120 + args: + - --port + - "8080" + - --model + - /mnt/models/gpt2 + - --served-model-name + - "gpt2" + command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + env: + - name: STORAGE_URI + value: pvc://vlmm-gpt2-claim/ + - name: HF_HUB_CACHE + value: /tmp + - name: TRANSFORMERS_CACHE + value: $(HF_HUB_CACHE) + - name: NUM_GPUS + value: "1" + - name: CUDA_VISIBLE_DEVICES + value: "0" + ports: + - containerPort: 8080 + protocol: TCP + resources: + limits: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" + requests: + cpu: "1" + memory: 4Gi + nvidia.com/gpu: "1" diff --git a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot new file mode 100644 index 000000000..85e701c76 --- /dev/null +++ b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot @@ -0,0 +1,119 @@ +*** Settings *** +Documentation Basic vLLM deploy test to validate metrics being correctly exposed in OpenShift +Resource ../../../../../Resources/Page/ODH/ODHDashboard/ODHModelServing.resource +Resource ../../../../../Resources/OCP.resource +Resource ../../../../../Resources/Page/Operators/ISVs.resource +Resource ../../../../../Resources/Page/ODH/ODHDashboard/ODHDashboardAPI.resource +Resource 
../../../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource +Resource ../../../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/DataConnections.resource +Resource ../../../../../Resources/CLI/ModelServing/llm.resource +Resource ../../../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Permissions.resource +Library OpenShiftLibrary +Suite Setup Suite Setup +Suite Teardown Suite Teardown +Test Tags KServe + + +*** Variables *** +${VLLM_RESOURCES_DIRPATH}= ods_ci/tests/Resources/Files/llm/vllm +${DL_POD_FILEPATH}= ${VLLM_RESOURCES_DIRPATH}/download_model.yaml +${SR_FILEPATH}= ${VLLM_RESOURCES_DIRPATH}/vllm_servingruntime.yaml +${IS_FILEPATH}= ${VLLM_RESOURCES_DIRPATH}/vllm-gpt2_inferenceservice.yaml +${INFERENCE_INPUT}= @${VLLM_RESOURCES_DIRPATH}/query.json +${INFERENCE_URL}= http://localhost:8080/v1/chat/completions +${METRICS_URL}= http://localhost:8080/metrics/ +${TEST_NS}= vllm-gpt2 +@{SEARCH_METRICS}= vllm:cache_config_info +... vllm:num_requests_running +... vllm:num_requests_swapped +... vllm:num_requests_waiting +... vllm:gpu_cache_usage_perc +... vllm:cpu_cache_usage_perc +... vllm:prompt_tokens_total +... vllm:generation_tokens_total +... vllm:time_to_first_token_seconds_bucket +... vllm:time_to_first_token_seconds_count +... vllm:time_to_first_token_seconds_sum +... vllm:time_per_output_token_seconds_bucket +... vllm:time_per_output_token_seconds_count +... vllm:time_per_output_token_seconds_sum +... vllm:e2e_request_latency_seconds_bucket +... vllm:e2e_request_latency_seconds_count +... vllm:e2e_request_latency_seconds_sum +... vllm:avg_prompt_throughput_toks_per_s +... vllm:avg_generation_throughput_toks_per_s + + +*** Test Cases *** +Verify User Can Deploy A Model With Vllm Via CLI + [Documentation] Deploy a model (gpt2) using the vllm runtime and confirm that it's running + [Tags] Tier1 Sanity Resources-GPU ODS-XXX + ${rc} ${out}= Run And Return Rc And Output oc apply -f ${DL_POD_FILEPATH} + Should Be Equal As Integers ${rc} ${0} + Wait For Pods To Succeed label_selector=gpt-download-pod=true namespace=${TEST_NS} + ${rc} ${out}= Run And Return Rc And Output oc apply -f ${SR_FILEPATH} + Should Be Equal As Integers ${rc} ${0} + ${rc} ${out}= Run And Return Rc And Output oc apply -f ${IS_FILEPATH} + Should Be Equal As Integers ${rc} ${0} + Wait For Pods To Be Ready label_selector=serving.kserve.io/inferenceservice=vllm-gpt2-openai + ... namespace=${TEST_NS} + ${pod_name}= Get Pod Name namespace=${TEST_NS} + ... label_selector=serving.kserve.io/inferenceservice=vllm-gpt2-openai + Start Port-forwarding namespace=${TEST_NS} pod_name=${pod_name} local_port=8080 remote_port=8080 + ${rc} ${out}= Run And Return Rc And Output + ... curl -ks ${INFERENCE_URL} -H "Content-Type: application/json" -d ${INFERENCE_INPUT} | jq . + Should Be Equal As Integers ${rc} ${0} + Log ${out} + +Verify Vllm Metrics Are Present + [Documentation] Confirm vLLM metrics are exposed in OpenShift metrics + [Tags] Tier1 Sanity Resources-GPU ODS-XXX + ${rc} ${out}= Run And Return Rc And Output + ... 
curl -ks ${METRICS_URL} + Should Be Equal As Integers ${rc} ${0} + Log ${out} + ${thanos_url}= Get OpenShift Thanos URL + ${token}= Generate Thanos Token + Metrics Should Exist In UserWorkloadMonitoring ${thanos_url} ${token} ${SEARCH_METRICS} + + +*** Keywords *** +Suite Setup + Skip If Component Is Not Enabled kserve + RHOSi Setup + Set Default Storage Class In GCP default=ssd-csi + ${is_self_managed}= Is RHODS Self-Managed + IF ${is_self_managed} + Configure User Workload Monitoring + Enable User Workload Monitoring + END + +Suite Teardown + Set Default Storage Class In GCP default=standard-csi + Terminate Process llm-query-process kill=true + ${rc}= Run And Return Rc oc delete inferenceservice -n ${TEST_NS} --all + Should Be Equal As Integers ${rc} ${0} + ${rc}= Run And Return Rc oc delete servingruntime -n ${TEST_NS} --all + Should Be Equal As Integers ${rc} ${0} + ${rc}= Run And Return Rc oc delete pod -n ${TEST_NS} --all + Should Be Equal As Integers ${rc} ${0} + ${rc}= Run And Return Rc oc delete namespace ${TEST_NS} + Should Be Equal As Integers ${rc} ${0} + RHOSi Teardown + +Set Default Storage Class In GCP + [Documentation] If the storage class exists we can assume we are in GCP. We force ssd-csi to be the default class + ... for the duration of this test suite. + [Arguments] ${default} + ${rc}= Run And Return Rc oc get storageclass ${default} + IF ${rc} == ${0} + IF "${default}" == "ssd-csi" + Run oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' #robocop: disable + Run oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' #robocop: disable + ELSE + Run oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' #robocop: disable + Run oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' #robocop: disable + END + ELSE + Log Proceeding with default storage class because we're not in GCP + END From ddaea0648d25417bc87090ac89250e47bf67c24b Mon Sep 17 00:00:00 2001 From: Luca Giorgi Date: Mon, 13 May 2024 14:42:52 +0200 Subject: [PATCH 2/6] Update ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml Co-authored-by: Vedant Mahabaleshwarkar --- ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml b/ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml index 4d02cdcd0..15309d851 100644 --- a/ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml +++ b/ods_ci/tests/Resources/Files/llm/vllm/vllm_servingruntime.yaml @@ -10,8 +10,6 @@ spec: serving.knative.openshift.io/enablePassthrough: "true" opendatahub.io/dashboard: "true" openshift.io/display-name: "vLLLM Openai entry point" - serving.kserve.io/enable-prometheus-scraping: "false" - serving.kserve.io/enable-metric-aggregation: "true" prometheus.io/port: '8080' prometheus.io/path: "/metrics/" multiModel: false From 787048ed5a277574c44e2aa78d34191989c2796f Mon Sep 17 00:00:00 2001 From: Luca Giorgi Date: Tue, 14 May 2024 18:28:20 +0200 Subject: [PATCH 3/6] reimplement using common keywords Signed-off-by: Luca Giorgi --- .../Files/llm/model_expected_responses.json | 13 ++++++++++++ .../Files/llm/runtime_query_formats.json | 17 ++++++++++++++++ .../426__model_serving_vllm_metrics.robot | 20 +++++++++---------- 3 files 
changed, 39 insertions(+), 11 deletions(-) diff --git a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json index 996c2ed02..c38dfeb7b 100644 --- a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json +++ b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json @@ -146,6 +146,19 @@ } } } + }, + { + "query_text": "{'role': 'system','content': 'You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.'},{'role': 'user','content': 'Compose a poem that explains the concept of recursion in programming.'}", + "models": { + "gpt2": { + "completion_tokens": 992, + "response_text": "A friend of mine came over to the house to play with his wife. He was asleep, and he felt like he'd been hit by a speeding car. He's a big guy. He's like the kind of guy who may not have a very good head, but he's big enough to stand up at a table and read something. I was like, \"I'm going to play with this.\"\n\nThat's where I started playing with my car. It was something I never dreamed of doing, but I'd never imagined that it would be such a big deal.\n\nWe started playing with it. When I was about 12, we started playing with it to see how it would turn out. I was 26, and I was playing it for the first time for the first time ever. It was fun. I remember thinking it was like a different game than I had ever played before. I remember thinking the first time we played would be like, \"Oh my god, I've never played a game like this before before.\"\n\nIt was surreal. I was in my 20s at the time. We got to have a party in my house at the time. I was sitting in the living room with my friend, who's 28. We're from Dallas, and his wife is a pretty big girl. He's about 6 feet tall and 250 pounds. On the phone with his friend said, \"Dad, is it possible you'll be able to do this without your beard?\" I was like, \"Absolutely, actually.\" I thought, \"I'm going to do it.\"\n\nI finally did it and it turned out pretty well. I was able to take our photo with our friend, and he got excited and started laughing. He was like, \"That's awesome.\" I sat in his living room for two hours and made sure he was really excited. He was really excited. We ended up having a workshop and we have a lot of stuff to do.\n\nHe just started playing. It's been amazing. I'm like, \"It's going to be huge.\" At first I was like, \"Wow, my god that's amazing.\" I was like, \"Wow, my God that's awesome.\" He's like, \"I'm so excited about this!\" He was like, \"Oh my god, I can't wait to do it!\"\n\nHe had that awesome physique. He was super skinny. He was like, \"I'm so excited about it.\" He was like, \"Really?\" I was like, \"Yeah, I'm so excited! I'm so excited.\" We did it for two weeks and it turned out pretty well.\n\nHe's like, \"I hope it stays that way.\" I was like, \"I hope it stays that way.\" He was like, \"Oh my god, I've never even played with a computer before!\" I was like, \"Yeah, it's just fun to play with a computer.\" He was like, \"Oh my god, I can't wait to play with a computer!\" He was like, \"It's just a cool thing to do!\"\n\nI was doing it with my friend's dog, a puppy.\n\nI was doing it with my friend's dog. People said, \"You think that's cool?\" I said, \"Yeah, that's cool.\" We had the dog. He was a little bit shy and it was a little bit intimidating and scary.\n\nWe played it twice. It was like a game. 
He was like, \"Oh my God I've never played with a computer before!\" I was like, \"I hope it stays that way.\" He was like, \"Yeah, it's just a cool thing to do!\" He was like, \"Oh my god, I can't wait to do it!\"\n\nWe played it again on the bus, on the weekend.\n\nWe played it again on the weekend.\n\nThen we went to the store and bought a new Canon 5D Mark II.\n\nI couldn't believe what the customer was saying. I was like, \"That sounds amazing!\" He was like, \"That's amazing!\"\n\nHe was like, \"Wow! That's awesome!\" So we were like, \"Wow! That looks awesome!\" He's like, \"Yeah, that looks awesome!\" I was like, \"Wow! That looks awesome! That looks awesome!\"\n\nWe played it twice again.\n\nI was like, \"Wow! That sounds awesome!\" He was like, \"Wow! That sounds awesome! That sounds awesome!\" I was like, \"Wow! That looks awesome!\"\n\nHe was like, \"Wow! That sounds awesome! That looks awesome!\"\n\nI was just like, \"Wow! That looks awesome! That looks awesome!\" He was like", + "streamed_response_text": "", + "vllm": { + "chat-completions_response_text": "" + } + } + } } ], "model-info": { diff --git a/ods_ci/tests/Resources/Files/llm/runtime_query_formats.json b/ods_ci/tests/Resources/Files/llm/runtime_query_formats.json index 17c9c1ec5..f17a0db3f 100644 --- a/ods_ci/tests/Resources/Files/llm/runtime_query_formats.json +++ b/ods_ci/tests/Resources/Files/llm/runtime_query_formats.json @@ -96,5 +96,22 @@ } }, "containers": ["kserve-container"] + }, + "vllm": { + "endpoints": { + "chat-completions": { + "http": { + "endpoint": "v1/chat/completions", + "header": "Content-Type:application/json", + "body": "{'model': '${model_name}','messages': [${query_text}]}", + "response_fields_map": { + "response": "choices", + "completion_tokens": "completion_tokens", + "response_text": "content" + } + } + } + }, + "containers": ["kserve-container"] } } diff --git a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot index 85e701c76..35b0cc421 100644 --- a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot +++ b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot @@ -53,23 +53,19 @@ Verify User Can Deploy A Model With Vllm Via CLI Wait For Pods To Succeed label_selector=gpt-download-pod=true namespace=${TEST_NS} ${rc} ${out}= Run And Return Rc And Output oc apply -f ${SR_FILEPATH} Should Be Equal As Integers ${rc} ${0} - ${rc} ${out}= Run And Return Rc And Output oc apply -f ${IS_FILEPATH} - Should Be Equal As Integers ${rc} ${0} + Deploy Model Via CLI ${IS_FILEPATH} ${TEST_NS} Wait For Pods To Be Ready label_selector=serving.kserve.io/inferenceservice=vllm-gpt2-openai ... namespace=${TEST_NS} - ${pod_name}= Get Pod Name namespace=${TEST_NS} - ... label_selector=serving.kserve.io/inferenceservice=vllm-gpt2-openai - Start Port-forwarding namespace=${TEST_NS} pod_name=${pod_name} local_port=8080 remote_port=8080 - ${rc} ${out}= Run And Return Rc And Output - ... curl -ks ${INFERENCE_URL} -H "Content-Type: application/json" -d ${INFERENCE_INPUT} | jq . - Should Be Equal As Integers ${rc} ${0} - Log ${out} + Query Model Multiple Times model_name=gpt2 isvc_name=vllm-gpt2-openai runtime=vllm protocol=http + ... inference_type=chat-completions n_times=3 query_idx=8 + ... 
namespace=${TEST_NS} string_check_only=${TRUE} Verify Vllm Metrics Are Present [Documentation] Confirm vLLM metrics are exposed in OpenShift metrics [Tags] Tier1 Sanity Resources-GPU ODS-XXX + ${host} = llm.Get KServe Inference Host Via CLI isvc_name=vllm-gpt2-openai namespace=${TEST_NS} ${rc} ${out}= Run And Return Rc And Output - ... curl -ks ${METRICS_URL} + ... curl -ks ${host}/metrics/ Should Be Equal As Integers ${rc} ${0} Log ${out} ${thanos_url}= Get OpenShift Thanos URL @@ -86,11 +82,13 @@ Suite Setup IF ${is_self_managed} Configure User Workload Monitoring Enable User Workload Monitoring + #TODO: Find reliable signal for UWM being ready + #Sleep 10m END + Load Expected Responses Suite Teardown Set Default Storage Class In GCP default=standard-csi - Terminate Process llm-query-process kill=true ${rc}= Run And Return Rc oc delete inferenceservice -n ${TEST_NS} --all Should Be Equal As Integers ${rc} ${0} ${rc}= Run And Return Rc oc delete servingruntime -n ${TEST_NS} --all From 4d813e2ea136f60d21f6ff995bab5dad9cd53c85 Mon Sep 17 00:00:00 2001 From: Luca Giorgi Date: Wed, 15 May 2024 16:31:53 +0200 Subject: [PATCH 4/6] Better handle missing metrics from UWM, change expected response format Signed-off-by: Luca Giorgi --- .../Resources/Files/llm/model_expected_responses.json | 5 +---- .../Resources/Page/ODH/Monitoring/Monitoring.resource | 10 +++++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json index c38dfeb7b..4616259fc 100644 --- a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json +++ b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json @@ -151,11 +151,8 @@ "query_text": "{'role': 'system','content': 'You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.'},{'role': 'user','content': 'Compose a poem that explains the concept of recursion in programming.'}", "models": { "gpt2": { - "completion_tokens": 992, - "response_text": "A friend of mine came over to the house to play with his wife. He was asleep, and he felt like he'd been hit by a speeding car. He's a big guy. He's like the kind of guy who may not have a very good head, but he's big enough to stand up at a table and read something. I was like, \"I'm going to play with this.\"\n\nThat's where I started playing with my car. It was something I never dreamed of doing, but I'd never imagined that it would be such a big deal.\n\nWe started playing with it. When I was about 12, we started playing with it to see how it would turn out. I was 26, and I was playing it for the first time for the first time ever. It was fun. I remember thinking it was like a different game than I had ever played before. I remember thinking the first time we played would be like, \"Oh my god, I've never played a game like this before before.\"\n\nIt was surreal. I was in my 20s at the time. We got to have a party in my house at the time. I was sitting in the living room with my friend, who's 28. We're from Dallas, and his wife is a pretty big girl. He's about 6 feet tall and 250 pounds. On the phone with his friend said, \"Dad, is it possible you'll be able to do this without your beard?\" I was like, \"Absolutely, actually.\" I thought, \"I'm going to do it.\"\n\nI finally did it and it turned out pretty well. I was able to take our photo with our friend, and he got excited and started laughing. 
He was like, \"That's awesome.\" I sat in his living room for two hours and made sure he was really excited. He was really excited. We ended up having a workshop and we have a lot of stuff to do.\n\nHe just started playing. It's been amazing. I'm like, \"It's going to be huge.\" At first I was like, \"Wow, my god that's amazing.\" I was like, \"Wow, my God that's awesome.\" He's like, \"I'm so excited about this!\" He was like, \"Oh my god, I can't wait to do it!\"\n\nHe had that awesome physique. He was super skinny. He was like, \"I'm so excited about it.\" He was like, \"Really?\" I was like, \"Yeah, I'm so excited! I'm so excited.\" We did it for two weeks and it turned out pretty well.\n\nHe's like, \"I hope it stays that way.\" I was like, \"I hope it stays that way.\" He was like, \"Oh my god, I've never even played with a computer before!\" I was like, \"Yeah, it's just fun to play with a computer.\" He was like, \"Oh my god, I can't wait to play with a computer!\" He was like, \"It's just a cool thing to do!\"\n\nI was doing it with my friend's dog, a puppy.\n\nI was doing it with my friend's dog. People said, \"You think that's cool?\" I said, \"Yeah, that's cool.\" We had the dog. He was a little bit shy and it was a little bit intimidating and scary.\n\nWe played it twice. It was like a game. He was like, \"Oh my God I've never played with a computer before!\" I was like, \"I hope it stays that way.\" He was like, \"Yeah, it's just a cool thing to do!\" He was like, \"Oh my god, I can't wait to do it!\"\n\nWe played it again on the bus, on the weekend.\n\nWe played it again on the weekend.\n\nThen we went to the store and bought a new Canon 5D Mark II.\n\nI couldn't believe what the customer was saying. I was like, \"That sounds amazing!\" He was like, \"That's amazing!\"\n\nHe was like, \"Wow! That's awesome!\" So we were like, \"Wow! That looks awesome!\" He's like, \"Yeah, that looks awesome!\" I was like, \"Wow! That looks awesome! That looks awesome!\"\n\nWe played it twice again.\n\nI was like, \"Wow! That sounds awesome!\" He was like, \"Wow! That sounds awesome! That sounds awesome!\" I was like, \"Wow! That looks awesome!\"\n\nHe was like, \"Wow! That sounds awesome! That looks awesome!\"\n\nI was just like, \"Wow! That looks awesome! That looks awesome!\" He was like", - "streamed_response_text": "", "vllm": { - "chat-completions_response_text": "" + "chat-completions_response_text": "A friend of mine came over to the house to play with his wife. He was asleep, and he felt like he'd been hit by a speeding car. He's a big guy. He's like the kind of guy who may not have a very good head, but he's big enough to stand up at a table and read something. I was like, \"I'm going to play with this.\"\n\nThat's where I started playing with my car. It was something I never dreamed of doing, but I'd never imagined that it would be such a big deal.\n\nWe started playing with it. When I was about 12, we started playing with it to see how it would turn out. I was 26, and I was playing it for the first time for the first time ever. It was fun. I remember thinking it was like a different game than I had ever played before. I remember thinking the first time we played would be like, \"Oh my god, I've never played a game like this before before.\"\n\nIt was surreal. I was in my 20s at the time. We got to have a party in my house at the time. I was sitting in the living room with my friend, who's 28. We're from Dallas, and his wife is a pretty big girl. 
He's about 6 feet tall and 250 pounds. On the phone with his friend said, \"Dad, is it possible you'll be able to do this without your beard?\" I was like, \"Absolutely, actually.\" I thought, \"I'm going to do it.\"\n\nI finally did it and it turned out pretty well. I was able to take our photo with our friend, and he got excited and started laughing. He was like, \"That's awesome.\" I sat in his living room for two hours and made sure he was really excited. He was really excited. We ended up having a workshop and we have a lot of stuff to do.\n\nHe just started playing. It's been amazing. I'm like, \"It's going to be huge.\" At first I was like, \"Wow, my god that's amazing.\" I was like, \"Wow, my God that's awesome.\" He's like, \"I'm so excited about this!\" He was like, \"Oh my god, I can't wait to do it!\"\n\nHe had that awesome physique. He was super skinny. He was like, \"I'm so excited about it.\" He was like, \"Really?\" I was like, \"Yeah, I'm so excited! I'm so excited.\" We did it for two weeks and it turned out pretty well.\n\nHe's like, \"I hope it stays that way.\" I was like, \"I hope it stays that way.\" He was like, \"Oh my god, I've never even played with a computer before!\" I was like, \"Yeah, it's just fun to play with a computer.\" He was like, \"Oh my god, I can't wait to play with a computer!\" He was like, \"It's just a cool thing to do!\"\n\nI was doing it with my friend's dog, a puppy.\n\nI was doing it with my friend's dog. People said, \"You think that's cool?\" I said, \"Yeah, that's cool.\" We had the dog. He was a little bit shy and it was a little bit intimidating and scary.\n\nWe played it twice. It was like a game. He was like, \"Oh my God I've never played with a computer before!\" I was like, \"I hope it stays that way.\" He was like, \"Yeah, it's just a cool thing to do!\" He was like, \"Oh my god, I can't wait to do it!\"\n\nWe played it again on the bus, on the weekend.\n\nWe played it again on the weekend.\n\nThen we went to the store and bought a new Canon 5D Mark II.\n\nI couldn't believe what the customer was saying. I was like, \"That sounds amazing!\" He was like, \"That's amazing!\"\n\nHe was like, \"Wow! That's awesome!\" So we were like, \"Wow! That looks awesome!\" He's like, \"Yeah, that looks awesome!\" I was like, \"Wow! That looks awesome! That looks awesome!\"\n\nWe played it twice again.\n\nI was like, \"Wow! That sounds awesome!\" He was like, \"Wow! That sounds awesome! That sounds awesome!\" I was like, \"Wow! That looks awesome!\"\n\nHe was like, \"Wow! That sounds awesome! That looks awesome!\"\n\nI was just like, \"Wow! That looks awesome! That looks awesome!\" He was like" } } } diff --git a/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource b/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource index 7e026d1ad..6127a192c 100644 --- a/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource +++ b/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource @@ -176,9 +176,13 @@ Metrics Should Exist In UserWorkloadMonitoring Log ${index}: ${metric_search_text} ${metrics_names}= Get Thanos Metrics List thanos_url=${thanos_url} thanos_token=${thanos_token} ... 
search_text=${metric_search_text}
-        Should Not Be Empty    ${metrics_names}
-        ${metrics_names}=    Split To Lines    ${metrics_names}
-        Append To List    ${metrics}    @{metrics_names}
+        ${found} =    Run Keyword And Return Status    Should Not Be Empty    ${metrics_names}
+        IF    ${found}
+            ${metrics_names}=    Split To Lines    ${metrics_names}
+            Append To List    ${metrics}    @{metrics_names}
+        ELSE
+            Run Keyword And Continue On Failure    Fail    msg=${metric_search_text} not found
+        END
     END
     RETURN    ${metrics}

From 6c0c3d9c999c36995281a8f0d6c61c350c047411 Mon Sep 17 00:00:00 2001
From: Luca Giorgi
Date: Thu, 16 May 2024 14:12:45 +0200
Subject: [PATCH 5/6] small cleanup

Signed-off-by: Luca Giorgi
---
 ods_ci/tests/Resources/Files/llm/vllm/query.json     | 13 -------------
 .../LLMs/vllm/426__model_serving_vllm_metrics.robot  |  7 ++-----
 2 files changed, 2 insertions(+), 18 deletions(-)
 delete mode 100644 ods_ci/tests/Resources/Files/llm/vllm/query.json

diff --git a/ods_ci/tests/Resources/Files/llm/vllm/query.json b/ods_ci/tests/Resources/Files/llm/vllm/query.json
deleted file mode 100644
index 156795eda..000000000
--- a/ods_ci/tests/Resources/Files/llm/vllm/query.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-    "model": "gpt2",
-    "messages": [
-        {
-            "role": "system",
-            "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."
-        },
-        {
-            "role": "user",
-            "content": "Compose a poem that explains the concept of recursion in programming."
-        }
-    ]
-}
\ No newline at end of file
diff --git a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot
index 35b0cc421..755bccc55 100644
--- a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot
+++ b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot
@@ -19,9 +19,6 @@ ${VLLM_RESOURCES_DIRPATH}=    ods_ci/tests/Resources/Files/llm/vllm
 ${DL_POD_FILEPATH}=           ${VLLM_RESOURCES_DIRPATH}/download_model.yaml
 ${SR_FILEPATH}=               ${VLLM_RESOURCES_DIRPATH}/vllm_servingruntime.yaml
 ${IS_FILEPATH}=               ${VLLM_RESOURCES_DIRPATH}/vllm-gpt2_inferenceservice.yaml
-${INFERENCE_INPUT}=           @${VLLM_RESOURCES_DIRPATH}/query.json
-${INFERENCE_URL}=             http://localhost:8080/v1/chat/completions
-${METRICS_URL}=               http://localhost:8080/metrics/
 ${TEST_NS}=                   vllm-gpt2
 @{SEARCH_METRICS}=            vllm:cache_config_info
 ...                           vllm:num_requests_running
@@ -47,7 +44,7 @@ ${TEST_NS}=    vllm-gpt2
 *** Test Cases ***
 Verify User Can Deploy A Model With Vllm Via CLI
     [Documentation]    Deploy a model (gpt2) using the vllm runtime and confirm that it's running
-    [Tags]    Tier1    Sanity    Resources-GPU    ODS-XXX
+    [Tags]    Tier1    Sanity    Resources-GPU    RHOAIENG-6264
     ${rc}    ${out}=    Run And Return Rc And Output    oc apply -f ${DL_POD_FILEPATH}
     Should Be Equal As Integers    ${rc}    ${0}
     Wait For Pods To Succeed    label_selector=gpt-download-pod=true    namespace=${TEST_NS}
     ${rc}    ${out}=    Run And Return Rc And Output    oc apply -f ${SR_FILEPATH}
     Should Be Equal As Integers    ${rc}    ${0}
@@ -62,7 +59,7 @@ Verify User Can Deploy A Model With Vllm Via CLI
 Verify Vllm Metrics Are Present
     [Documentation]    Confirm vLLM metrics are exposed in OpenShift metrics
-    [Tags]    Tier1    Sanity    Resources-GPU    ODS-XXX
+    [Tags]    Tier1    Sanity    Resources-GPU    RHOAIENG-6264
     ${host} =    llm.Get KServe Inference Host Via CLI    isvc_name=vllm-gpt2-openai    namespace=${TEST_NS}
     ${rc}    ${out}=    Run And Return Rc And Output
     ...    curl -ks ${host}/metrics/
     Should Be Equal As Integers    ${rc}    ${0}

From 95093e418517b929ab03e2b904da0dfc34742898 Mon Sep 17 00:00:00 2001
From: Luca Giorgi
Date: Fri, 17 May 2024 11:58:38 +0200
Subject: [PATCH 6/6] Move keyword, some cleanup, comments

Signed-off-by: Luca Giorgi
---
 ods_ci/tests/Resources/OCP.resource           | 17 ++++++++++++++
 .../LLMs/422__model_serving_llm_models.robot  | 19 ++--------------
 .../426__model_serving_vllm_metrics.robot     | 22 +++++--------------
 3 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/ods_ci/tests/Resources/OCP.resource b/ods_ci/tests/Resources/OCP.resource
index 8b04230b2..7658f4256 100644
--- a/ods_ci/tests/Resources/OCP.resource
+++ b/ods_ci/tests/Resources/OCP.resource
@@ -258,3 +258,20 @@ Check If Pod Does Not Exist
     ${rc}    ${output}=    Run And Return Rc And Output
     ...    oc get pod -l {label_selector} -n ${namespace}
     Should Be Equal    "${rc}"    "1"    msg=${output}
+
+Set Default Storage Class In GCP
+    [Documentation]    If the storage class exists we can assume we are in GCP. We force ssd-csi to be the default class
+    ...    for the duration of this test suite.
+    [Arguments]    ${default}
+    ${rc}=    Run And Return Rc    oc get storageclass ${default}
+    IF    ${rc} == ${0}
+        IF    "${default}" == "ssd-csi"
+            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
+            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
+        ELSE
+            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
+            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
+        END
+    ELSE
+        Log    Proceeding with default storage class because we're not in GCP
+    END
diff --git a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/422__model_serving_llm_models.robot b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/422__model_serving_llm_models.robot
index eee446ab7..493c1bcdb 100644
--- a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/422__model_serving_llm_models.robot
+++ b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/422__model_serving_llm_models.robot
@@ -19,6 +19,8 @@ ${KSERVE_MODE}=    RawDeployment
 ${MODEL_FORMAT}=    pytorch    #vLLM
 ${PROTOCOL}=    grpc    #http
 ${OVERLAY}=    vllm
+
+
 *** Test Cases ***
 Verify User Can Serve And Query A bigscience/mt0-xxl Model
     [Documentation]    Basic tests for preparing, deploying and querying a LLM model
@@ -454,23 +456,6 @@ Suite Teardown
     Set Default Storage Class In GCP    default=standard-csi
     RHOSi Teardown
 
-Set Default Storage Class In GCP
-    [Documentation]    If the storage class exists we can assume we are in GCP. We force ssd-csi to be the default class
-    ...    for the duration of this test suite.
-    [Arguments]    ${default}
-    ${rc}=    Run And Return Rc    oc get storageclass ${default}
-    IF    ${rc} == ${0}
-        IF    "${default}" == "ssd-csi"
-            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
-            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
-        ELSE
-            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
-            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
-        END
-    ELSE
-        Log    Proceeding with default storage class because we're not in GCP
-    END
-
 Setup Test Variables
     [Arguments]    ${model_name}    ${kserve_mode}=Serverless    ${use_pvc}=${FALSE}    ${use_gpu}=${FALSE}
     ...    ${model_path}=${model_name}
diff --git a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot
index 755bccc55..09afd76e3 100644
--- a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot
+++ b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/vllm/426__model_serving_vllm_metrics.robot
@@ -50,6 +50,10 @@ Verify User Can Deploy A Model With Vllm Via CLI
     Wait For Pods To Succeed    label_selector=gpt-download-pod=true    namespace=${TEST_NS}
     ${rc}    ${out}=    Run And Return Rc And Output    oc apply -f ${SR_FILEPATH}
     Should Be Equal As Integers    ${rc}    ${0}
+    #TODO: Switch to common keyword for model DL and SR deploy
+    #Set Project And Runtime    runtime=vllm    namespace=${TEST_NS}
+    #...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=gpt2
+    #...    storage_size=10Gi
     Deploy Model Via CLI    ${IS_FILEPATH}    ${TEST_NS}
     Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=vllm-gpt2-openai
     ...    namespace=${TEST_NS}
@@ -60,6 +64,7 @@ Verify User Can Deploy A Model With Vllm Via CLI
 Verify Vllm Metrics Are Present
     [Documentation]    Confirm vLLM metrics are exposed in OpenShift metrics
     [Tags]    Tier1    Sanity    Resources-GPU    RHOAIENG-6264
+    ...    Depends On Test    Verify User Can Deploy A Model With Vllm Via CLI
     ${host} =    llm.Get KServe Inference Host Via CLI    isvc_name=vllm-gpt2-openai    namespace=${TEST_NS}
     ${rc}    ${out}=    Run And Return Rc And Output
     ...    curl -ks ${host}/metrics/
     Should Be Equal As Integers    ${rc}    ${0}
     Log    ${out}
     ${thanos_url}=    Get OpenShift Thanos URL
     ${token}=    Generate Thanos Token
     Metrics Should Exist In UserWorkloadMonitoring    ${thanos_url}    ${token}    ${SEARCH_METRICS}
@@ -95,20 +100,3 @@ Suite Teardown
     ${rc}=    Run And Return Rc    oc delete namespace ${TEST_NS}
     Should Be Equal As Integers    ${rc}    ${0}
     RHOSi Teardown
-
-Set Default Storage Class In GCP
-    [Documentation]    If the storage class exists we can assume we are in GCP. We force ssd-csi to be the default class
-    ...    for the duration of this test suite.
-    [Arguments]    ${default}
-    ${rc}=    Run And Return Rc    oc get storageclass ${default}
-    IF    ${rc} == ${0}
-        IF    "${default}" == "ssd-csi"
-            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
-            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
-        ELSE
-            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
-            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
-        END
-    ELSE
-        Log    Proceeding with default storage class because we're not in GCP
-    END
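
A quick manual walk-through of what the suite automates, for reviewers who want to poke at the deployment by hand. This is only an illustrative sketch, not part of the patches: the predictor pod name placeholder, the local port-forward, the trimmed-down request body, and the availability of jq are assumptions layered on top of the files added above.

  # Forward the runtime's port 8080 locally, as the Start Port-forwarding keyword does
  # (<predictor-pod-name> is a placeholder for the vllm-gpt2-openai predictor pod).
  oc -n vllm-gpt2 port-forward pod/<predictor-pod-name> 8080:8080 &

  # Chat-completions request against the OpenAI-compatible endpoint (same payload shape as query.json).
  curl -ks http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "gpt2", "messages": [{"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}]}' | jq .

  # Scrape the runtime's Prometheus endpoint and keep only the vLLM series the test asserts on.
  curl -ks http://localhost:8080/metrics/ | grep '^vllm:'

  # Optionally confirm user-workload monitoring has scraped one of the series; THANOS_HOST and TOKEN
  # stand in for the values the Get OpenShift Thanos URL / Generate Thanos Token keywords return.
  curl -ks -H "Authorization: Bearer ${TOKEN}" \
      "https://${THANOS_HOST}/api/v1/query?query=vllm:num_requests_running" | jq '.data.result'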