Commit
Signed-off-by: Luca Giorgi <lgiorgi@redhat.com>
@@ -0,0 +1,74 @@
apiVersion: v1
kind: Namespace
metadata:
  name: vllm-gpt2
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vlmm-gpt2-claim
  namespace: vllm-gpt2
spec:
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: 10Gi
---
apiVersion: v1
kind: Pod
metadata:
  name: setup-gpt2-binary
  namespace: vllm-gpt2
  labels:
    gpt-download-pod: 'true'
spec:
  volumes:
    - name: model-volume
      persistentVolumeClaim:
        claimName: vlmm-gpt2-claim
  restartPolicy: Never
  initContainers:
    - name: fix-volume-permissions
      image: quay.io/quay/busybox:latest
      imagePullPolicy: IfNotPresent
      securityContext:
        allowPrivilegeEscalation: true
      resources:
        requests:
          memory: "64Mi"
          cpu: "250m"
          nvidia.com/gpu: "1"
        limits:
          memory: "128Mi"
          cpu: "500m"
          nvidia.com/gpu: "1"
      command: ["sh"]
      args: ["-c", "chown -R 1001:1001 /mnt/models"]
      volumeMounts:
        - mountPath: "/mnt/models/"
          name: model-volume
  containers:
    - name: download-model
      image: registry.access.redhat.com/ubi9/python-311:latest
      imagePullPolicy: IfNotPresent
      securityContext:
        allowPrivilegeEscalation: true
      resources:
        requests:
          memory: "1Gi"
          cpu: "1"
          nvidia.com/gpu: "1"
        limits:
          memory: "1Gi"
          cpu: "1"
          nvidia.com/gpu: "1"
      command: ["sh"]
      args: [ "-c", "pip install --upgrade pip && pip install --upgrade huggingface_hub && python3 -c 'from huggingface_hub import snapshot_download\nsnapshot_download(repo_id=\"gpt2\", local_dir=\"/mnt/models/gpt2\", local_dir_use_symlinks=False)'"]
      volumeMounts:
        - mountPath: "/mnt/models/"
          name: model-volume
      env:
        - name: TRANSFORMERS_CACHE
          value: /tmp
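
For a manual run outside the Robot suite, the manifest above (download_model.yaml in the suite's variables) can be applied with oc and the download pod polled until it finishes. This is only a sketch of the flow the test automates; the timeout value is an assumption and the jsonpath form of oc wait needs a reasonably recent oc/kubectl.

# Create the namespace, PVC and model-download pod defined above.
oc apply -f download_model.yaml
# Wait for the download pod (labelled gpt-download-pod=true in the manifest) to succeed.
# The 15m timeout is an assumption, not taken from the suite.
oc wait pod -n vllm-gpt2 -l gpt-download-pod=true \
  --for=jsonpath='{.status.phase}'=Succeeded --timeout=15m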
@@ -0,0 +1,13 @@
{
  "model": "gpt2",
  "messages": [
    {
      "role": "system",
      "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."
    },
    {
      "role": "user",
      "content": "Compose a poem that explains the concept of recursion in programming."
    }
  ]
}
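
This payload (query.json) is the body the suite POSTs to the OpenAI-compatible endpoint once the port-forward is in place. An equivalent manual request, run from the directory holding query.json, mirrors the curl call in the test below:

# Same request the test issues against ${INFERENCE_URL}.
curl -ks http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d @query.json | jq .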
@@ -0,0 +1,14 @@
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: vllm-gpt2-openai
  namespace: vllm-gpt2
  labels:
    modelmesh-enabled: "true"
spec:
  predictor:
    model:
      runtime: kserve-vllm
      modelFormat:
        name: vLLM
      storageUri: pvc://vlmm-gpt2-claim/
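
After the ServingRuntime (next file) and this InferenceService (vllm-gpt2_inferenceservice.yaml) are applied, the suite waits for the predictor pod to become ready via its serving.kserve.io/inferenceservice label. A rough CLI equivalent, with an assumed timeout, would be:

# Apply the InferenceService and wait for its predictor pod, as "Wait For Pods To Be Ready" does.
# The 10m timeout is an assumption.
oc apply -f vllm-gpt2_inferenceservice.yaml
oc wait pod -n vllm-gpt2 \
  -l serving.kserve.io/inferenceservice=vllm-gpt2-openai \
  --for=condition=Ready --timeout=10m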
@@ -0,0 +1,79 @@
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: kserve-vllm
  namespace: vllm-gpt2
spec:
  annotations:
    sidecar.istio.io/inject: "true"
    sidecar.istio.io/rewriteAppHTTPProbers: "true"
    serving.knative.openshift.io/enablePassthrough: "true"
    opendatahub.io/dashboard: "true"
    openshift.io/display-name: "vLLM OpenAI entry point"
    serving.kserve.io/enable-prometheus-scraping: "false"
    serving.kserve.io/enable-metric-aggregation: "true"
    prometheus.io/port: '8080'
    prometheus.io/path: "/metrics/"
  multiModel: false
  supportedModelFormats:
    - name: vLLM
      autoSelect: true
  containers:
    - name: kserve-container
      #image: kserve/vllmserver:latest
      image: quay.io/wxpe/tgis-vllm:release.74803b6
      startupProbe:
        httpGet:
          port: 8080
          path: /health
        # Allow 12 minutes to start
        failureThreshold: 24
        periodSeconds: 30
      readinessProbe:
        httpGet:
          port: 8080
          path: /health
        periodSeconds: 30
        timeoutSeconds: 5
      livenessProbe:
        httpGet:
          port: 8080
          path: /health
        periodSeconds: 100
        timeoutSeconds: 8
      terminationMessagePolicy: "FallbackToLogsOnError"
      terminationGracePeriodSeconds: 120
      args:
        - --port
        - "8080"
        - --model
        - /mnt/models/gpt2
        - --served-model-name
        - "gpt2"
      command:
        - python3
        - -m
        - vllm.entrypoints.openai.api_server
      env:
        - name: STORAGE_URI
          value: pvc://vlmm-gpt2-claim/
        - name: HF_HUB_CACHE
          value: /tmp
        - name: TRANSFORMERS_CACHE
          value: $(HF_HUB_CACHE)
        - name: NUM_GPUS
          value: "1"
        - name: CUDA_VISIBLE_DEVICES
          value: "0"
      ports:
        - containerPort: 8080
          protocol: TCP
      resources:
        limits:
          cpu: "4"
          memory: 8Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "1"
          memory: 4Gi
          nvidia.com/gpu: "1"
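
This runtime exposes the OpenAI-compatible API plus the /health and /metrics/ endpoints on port 8080, which both the probes above and the test suite rely on. A sketch of reaching them locally through a port-forward, looking the pod up by label as the suite's Get Pod Name and Start Port-forwarding keywords do:

# Find the predictor pod and forward its serving port.
POD=$(oc get pod -n vllm-gpt2 \
  -l serving.kserve.io/inferenceservice=vllm-gpt2-openai -o name | head -n 1)
oc port-forward -n vllm-gpt2 "$POD" 8080:8080 &
# Same endpoints the probes and the metrics test hit.
curl -ks http://localhost:8080/health
curl -ks http://localhost:8080/metrics/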
@@ -0,0 +1,119 @@
*** Settings ***
Documentation       Basic vLLM deploy test to validate metrics being correctly exposed in OpenShift
Resource            ../../../../../Resources/Page/ODH/ODHDashboard/ODHModelServing.resource
Resource            ../../../../../Resources/OCP.resource
Resource            ../../../../../Resources/Page/Operators/ISVs.resource
Resource            ../../../../../Resources/Page/ODH/ODHDashboard/ODHDashboardAPI.resource
Resource            ../../../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource
Resource            ../../../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/DataConnections.resource
Resource            ../../../../../Resources/CLI/ModelServing/llm.resource
Resource            ../../../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Permissions.resource
Library             OpenShiftLibrary
Suite Setup         Suite Setup
Suite Teardown      Suite Teardown
Test Tags           KServe

*** Variables ***
${VLLM_RESOURCES_DIRPATH}=      ods_ci/tests/Resources/Files/llm/vllm
${DL_POD_FILEPATH}=             ${VLLM_RESOURCES_DIRPATH}/download_model.yaml
${SR_FILEPATH}=                 ${VLLM_RESOURCES_DIRPATH}/vllm_servingruntime.yaml
${IS_FILEPATH}=                 ${VLLM_RESOURCES_DIRPATH}/vllm-gpt2_inferenceservice.yaml
${INFERENCE_INPUT}=             @${VLLM_RESOURCES_DIRPATH}/query.json
${INFERENCE_URL}=               http://localhost:8080/v1/chat/completions
${METRICS_URL}=                 http://localhost:8080/metrics/
${TEST_NS}=                     vllm-gpt2
@{SEARCH_METRICS}=              vllm:cache_config_info
...                             vllm:num_requests_running
...                             vllm:num_requests_swapped
...                             vllm:num_requests_waiting
...                             vllm:gpu_cache_usage_perc
...                             vllm:cpu_cache_usage_perc
...                             vllm:prompt_tokens_total
...                             vllm:generation_tokens_total
...                             vllm:time_to_first_token_seconds_bucket
...                             vllm:time_to_first_token_seconds_count
...                             vllm:time_to_first_token_seconds_sum
...                             vllm:time_per_output_token_seconds_bucket
...                             vllm:time_per_output_token_seconds_count
...                             vllm:time_per_output_token_seconds_sum
...                             vllm:e2e_request_latency_seconds_bucket
...                             vllm:e2e_request_latency_seconds_count
...                             vllm:e2e_request_latency_seconds_sum
...                             vllm:avg_prompt_throughput_toks_per_s
...                             vllm:avg_generation_throughput_toks_per_s

*** Test Cases ***
Verify User Can Deploy A Model With Vllm Via CLI
    [Documentation]    Deploy a model (gpt2) using the vllm runtime and confirm that it's running
    [Tags]    Tier1    Sanity    Resources-GPU    ODS-XXX
    ${rc}    ${out}=    Run And Return Rc And Output    oc apply -f ${DL_POD_FILEPATH}
    Should Be Equal As Integers    ${rc}    ${0}
    Wait For Pods To Succeed    label_selector=gpt-download-pod=true    namespace=${TEST_NS}
    ${rc}    ${out}=    Run And Return Rc And Output    oc apply -f ${SR_FILEPATH}
    Should Be Equal As Integers    ${rc}    ${0}
    ${rc}    ${out}=    Run And Return Rc And Output    oc apply -f ${IS_FILEPATH}
    Should Be Equal As Integers    ${rc}    ${0}
    Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=vllm-gpt2-openai
    ...    namespace=${TEST_NS}
    ${pod_name}=    Get Pod Name    namespace=${TEST_NS}
    ...    label_selector=serving.kserve.io/inferenceservice=vllm-gpt2-openai
    Start Port-forwarding    namespace=${TEST_NS}    pod_name=${pod_name}    local_port=8080    remote_port=8080
    ${rc}    ${out}=    Run And Return Rc And Output
    ...    curl -ks ${INFERENCE_URL} -H "Content-Type: application/json" -d ${INFERENCE_INPUT} | jq .
    Should Be Equal As Integers    ${rc}    ${0}
    Log    ${out}

Verify Vllm Metrics Are Present
    [Documentation]    Confirm vLLM metrics are exposed in OpenShift metrics
    [Tags]    Tier1    Sanity    Resources-GPU    ODS-XXX
    ${rc}    ${out}=    Run And Return Rc And Output
    ...    curl -ks ${METRICS_URL}
    Should Be Equal As Integers    ${rc}    ${0}
    Log    ${out}
    ${thanos_url}=    Get OpenShift Thanos URL
    ${token}=    Generate Thanos Token
    Metrics Should Exist In UserWorkloadMonitoring    ${thanos_url}    ${token}    ${SEARCH_METRICS}

*** Keywords ***
Suite Setup
    [Documentation]    Skip the suite if KServe is not enabled, then run RHOSi setup and prepare the storage class and user workload monitoring
    Skip If Component Is Not Enabled    kserve
    RHOSi Setup
    Set Default Storage Class In GCP    default=ssd-csi
    ${is_self_managed}=    Is RHODS Self-Managed
    IF    ${is_self_managed}
        Configure User Workload Monitoring
        Enable User Workload Monitoring
    END

Suite Teardown
    [Documentation]    Restore the default storage class, clean up the test namespace and run RHOSi teardown
    Set Default Storage Class In GCP    default=standard-csi
    Terminate Process    llm-query-process    kill=true
    ${rc}=    Run And Return Rc    oc delete inferenceservice -n ${TEST_NS} --all
    Should Be Equal As Integers    ${rc}    ${0}
    ${rc}=    Run And Return Rc    oc delete servingruntime -n ${TEST_NS} --all
    Should Be Equal As Integers    ${rc}    ${0}
    ${rc}=    Run And Return Rc    oc delete pod -n ${TEST_NS} --all
    Should Be Equal As Integers    ${rc}    ${0}
    ${rc}=    Run And Return Rc    oc delete namespace ${TEST_NS}
    Should Be Equal As Integers    ${rc}    ${0}
    RHOSi Teardown

Set Default Storage Class In GCP
    [Documentation]    If the storage class exists we can assume we are in GCP. We force ssd-csi to be the default class
    ...    for the duration of this test suite.
    [Arguments]    ${default}
    ${rc}=    Run And Return Rc    oc get storageclass ${default}
    IF    ${rc} == ${0}
        IF    "${default}" == "ssd-csi"
            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
        ELSE
            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
        END
    ELSE
        Log    Proceeding with default storage class because we're not in GCP
    END
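
For completeness, the cleanup performed by the Suite Teardown keyword corresponds roughly to the following oc commands, taken from the keyword above; run them only against the test namespace:

# Delete the resources the suite created, then the namespace itself (mirrors Suite Teardown).
oc delete inferenceservice -n vllm-gpt2 --all
oc delete servingruntime -n vllm-gpt2 --all
oc delete pod -n vllm-gpt2 --all
oc delete namespace vllm-gpt2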