From 063cfe2033cf5910f62917a7a0b58cb3be702a2e Mon Sep 17 00:00:00 2001 From: Raghul-M Date: Fri, 13 Dec 2024 15:06:51 +0530 Subject: [PATCH 1/4] Initial commit --- .../Resources/CLI/ModelServing/llm.resource | 1 + .../Files/llm/serving_runtimes/base/isvc.yaml | 1 + .../triton_servingruntime_http.yaml | 55 +++++ .../ODH/ODHDashboard/ODHModelServing.resource | 2 +- ...__model_serving_triton_on_kserve_api.robot | 207 ++++++++++++++++++ 5 files changed, 265 insertions(+), 1 deletion(-) create mode 100644 ods_ci/tests/Resources/Files/llm/serving_runtimes/triton_servingruntime_http.yaml create mode 100644 ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot diff --git a/ods_ci/tests/Resources/CLI/ModelServing/llm.resource b/ods_ci/tests/Resources/CLI/ModelServing/llm.resource index e131d93dd..1d70a0bf3 100644 --- a/ods_ci/tests/Resources/CLI/ModelServing/llm.resource +++ b/ods_ci/tests/Resources/CLI/ModelServing/llm.resource @@ -27,6 +27,7 @@ ${SERVICEMESH_CR_NS}= istio-system ... vllm-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/vllm_servingruntime_{{protocol}}.yaml ... ovms-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/ovms_servingruntime_{{protocol}}.yaml ... caikit-standalone-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/caikit_standalone_servingruntime_{{protocol}}.yaml # robocop: disable +... triton-kserve-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/triton_servingruntime_{{protocol}}.yaml # robocop: disable ${DOWNLOAD_PVC_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/download_model_in_pvc.yaml ${DOWNLOAD_PVC_FILLED_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/download_model_in_pvc_filled.yaml diff --git a/ods_ci/tests/Resources/Files/llm/serving_runtimes/base/isvc.yaml b/ods_ci/tests/Resources/Files/llm/serving_runtimes/base/isvc.yaml index 3dc4d26d2..8fbdabc53 100644 --- a/ods_ci/tests/Resources/Files/llm/serving_runtimes/base/isvc.yaml +++ b/ods_ci/tests/Resources/Files/llm/serving_runtimes/base/isvc.yaml @@ -20,6 +20,7 @@ spec: volumeMounts: [] modelFormat: name: ${model_format} + version: ${version} runtime: ${serving_runtime} storageUri: ${model_storage_uri} volumes: [] diff --git a/ods_ci/tests/Resources/Files/llm/serving_runtimes/triton_servingruntime_http.yaml b/ods_ci/tests/Resources/Files/llm/serving_runtimes/triton_servingruntime_http.yaml new file mode 100644 index 000000000..b7bcf8d1e --- /dev/null +++ b/ods_ci/tests/Resources/Files/llm/serving_runtimes/triton_servingruntime_http.yaml @@ -0,0 +1,55 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + name: triton-kserve-runtime +spec: + annotations: + prometheus.kserve.io/path: /metrics + prometheus.kserve.io/port: "8002" + containers: + - args: + - tritonserver + - --model-store=/mnt/models + - --grpc-port=9000 + - --http-port=8080 + - --allow-grpc=true + - --allow-http=true + image: nvcr.io/nvidia/tritonserver:23.05-py3 + name: kserve-container + resources: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "1" + memory: 2Gi + ports: + - containerPort: 8080 + protocol: TCP + protocolVersions: + - v2 + - grpc-v2 + supportedModelFormats: + - autoSelect: true + name: tensorrt + version: "8" + - autoSelect: true + name: tensorflow + version: "1" + - autoSelect: true + name: tensorflow + version: "2" + - autoSelect: true + name: onnx + version: "1" + - name: pytorch + version: "1" + - autoSelect: true + name: triton + version: "2" + - autoSelect: true + name: xgboost + version: "1" + - autoSelect: true + name: python + version: "1" 
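For reference, a minimal sketch of the InferenceService that the new triton-kserve-runtime and the updated base/isvc.yaml template (with the added ${version} field) are expected to produce for the python model exercised by the suite added later in this series. The namespace follows ${TEST_NS}-${model_name} from that suite; the bucket name and any cluster-side values are hypothetical placeholders, not values defined by this patch.

# Illustrative sketch only: roughly how base/isvc.yaml should render once ${version},
# the runtime name and the memory request from the test are substituted.
# "models-bucket" is a placeholder bucket name.
cat <<'EOF' | oc apply -n tritonmodel-python -f -
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: python
  annotations:
    serving.kserve.io/deploymentMode: Serverless
spec:
  predictor:
    serviceAccountName: models-bucket-sa
    model:
      modelFormat:
        name: python
        version: "1"                      # new field wired through ${version}
      runtime: triton-kserve-runtime
      storageUri: s3://models-bucket/triton/model_repository/
      resources:
        requests:
          memory: 1Gi
EOF
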
diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource index 9103015f4..5a4baf790 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource @@ -339,7 +339,7 @@ Get Model Inference ${rc} ${url}= Run And Return Rc And Output ... oc get ksvc ${model_name}-predictor -n ${project_title} -o jsonpath='{.status.url}' Should Be Equal As Integers ${rc} 0 - ${curl_cmd}= Set Variable curl -s ${url}${end_point} -d ${inference_input} + ${curl_cmd}= Set Variable curl -sk ${url}${end_point} -d ${inference_input} ELSE IF '${kserve_mode}' == 'RawDeployment' ${url}= Set Variable http://localhost:${service_port}${end_point} ${curl_cmd}= Set Variable curl -s ${url} -d ${inference_input} --cacert openshift_ca_istio_knative.crt diff --git a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot new file mode 100644 index 000000000..0a9c09726 --- /dev/null +++ b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot @@ -0,0 +1,207 @@ +*** Settings *** +Documentation Suite of test cases for Triton in Kserve +Library OperatingSystem +Library ../../../../libs/Helpers.py +Resource ../../../Resources/Page/ODH/JupyterHub/HighAvailability.robot +Resource ../../../Resources/Page/ODH/ODHDashboard/ODHModelServing.resource +Resource ../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Projects.resource +Resource ../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/DataConnections.resource +Resource ../../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource +Resource ../../../Resources/Page/ODH/ODHDashboard/ODHDashboardSettingsRuntimes.resource +Resource ../../../Resources/Page/ODH/Monitoring/Monitoring.resource +Resource ../../../Resources/OCP.resource +Resource ../../../Resources/CLI/ModelServing/modelmesh.resource +Resource ../../../Resources/Common.robot +Resource ../../../Resources/CLI/ModelServing/llm.resource +Suite Setup Suite Setup +Suite Teardown Suite Teardown +Test Tags Kserve + +*** Variables *** +${PYTHON_MODEL_NAME}= python +${EXPECTED_INFERENCE_REST_OUTPUT_PYTHON}= {"model_name":"python","model_version":"1","outputs":[{"name":"OUTPUT0","datatype":"FP32","shape":[4],"data":[0.921442985534668,0.6223347187042236,0.8059385418891907,1.2578542232513428]},{"name":"OUTPUT1","datatype":"FP32","shape":[4],"data":[0.49091365933418274,-0.027157962322235107,-0.5641784071922302,0.6906309723854065]}]} +${INFERENCE_REST_INPUT_PYTHON}= @tests/Resources/Files/triton/kserve-triton-python-rest-input.json +${KSERVE_MODE}= Serverless # Serverless +${PROTOCOL}= http +${TEST_NS}= tritonmodel +${DOWNLOAD_IN_PVC}= ${FALSE} +${MODELS_BUCKET}= ${S3.BUCKET_1} +${LLM_RESOURCES_DIRPATH}= tests/Resources/Files/llm +${INFERENCESERVICE_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/serving_runtimes/base/isvc.yaml +${INFERENCESERVICE_FILEPATH_NEW}= ${LLM_RESOURCES_DIRPATH}/serving_runtimes/isvc +${INFERENCESERVICE_FILLED_FILEPATH}= ${INFERENCESERVICE_FILEPATH_NEW}/isvc_filled.yaml +${ONNX_RUNTIME_NAMEs}= triton-kserve-runtime + + +*** Test Cases *** +Test Python Model Rest Inference Via API (Triton on Kserve) # robocop: off=too-long-test-case + 
[Documentation] Test the deployment of python model in Kserve using Triton + [Tags] Tier2 Resources-GPU NVIDIA-GPUs RunThisTest + Setup Test Variables model_name=${PYTHON_MODEL_NAME} use_pvc=${FALSE} use_gpu=${FALSE} + ... kserve_mode=${KSERVE_MODE} model_path=triton/model_repository/ + Set Project And Runtime runtime=${ONNX_RUNTIME_NAMEs} protocol=${PROTOCOL} namespace=${test_namespace} + ... download_in_pvc=${DOWNLOAD_IN_PVC} model_name=${PYTHON_MODEL_NAME} + ... storage_size=100Mi memory_request=100Mi + ${requests}= Create Dictionary memory=1Gi + Compile Inference Service YAML isvc_name=${PYTHON_MODEL_NAME} + ... sa_name=models-bucket-sa + ... model_storage_uri=${storage_uri} + ... model_format=python serving_runtime=${ONNX_RUNTIME_NAMEs} + ... version="1" + ... limits_dict=${limits} requests_dict=${requests} kserve_mode=${KSERVE_MODE} + Deploy Model Via CLI isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH} + ... namespace=${test_namespace} + # File is not needed anymore after applying + Remove File ${INFERENCESERVICE_FILLED_FILEPATH} + Wait For Pods To Be Ready label_selector=serving.kserve.io/inferenceservice=${PYTHON_MODEL_NAME} + ... namespace=${test_namespace} + ${pod_name}= Get Pod Name namespace=${test_namespace} + ... label_selector=serving.kserve.io/inferenceservice=${PYTHON_MODEL_NAME} + ${service_port}= Extract Service Port service_name=${PYTHON_MODEL_NAME}-predictor protocol=TCP + ... namespace=${test_namespace} + IF "${KSERVE_MODE}"=="RawDeployment" + Start Port-forwarding namespace=${test_namespace} pod_name=${pod_name} local_port=${service_port} + ... remote_port=${service_port} process_alias=triton-process + END + Verify Model Inference With Retries model_name=${PYTHON_MODEL_NAME} inference_input=${INFERENCE_REST_INPUT_PYTHON} + ... expected_inference_output=${EXPECTED_INFERENCE_REST_OUTPUT_PYTHON} project_title=${test_namespace} + ... deployment_mode=Cli kserve_mode=${KSERVE_MODE} service_port=${service_port} + ... end_point=/v2/models/${model_name}/infer retries=3 + [Teardown] Run Keywords + ... Clean Up Test Project test_ns=${test_namespace} + ... isvc_names=${models_names} wait_prj_deletion=${FALSE} kserve_mode=${KSERVE_MODE} + ... AND + ... Run Keyword If "${KSERVE_MODE}"=="RawDeployment" Terminate Process triton-process kill=true + + +*** Keywords *** +Suite Setup + [Documentation] Suite setup keyword + Set Library Search Order SeleniumLibrary + Skip If Component Is Not Enabled kserve + RHOSi Setup + Load Expected Responses + Set Default Storage Class In GCP default=ssd-csi + +Suite Teardown + [Documentation] Suite teardown keyword + Set Default Storage Class In GCP default=standard-csi + RHOSi Teardown + +Setup Test Variables # robocop: off=too-many-calls-in-keyword + [Documentation] Sets up variables for the Suite + [Arguments] ${model_name} ${kserve_mode}=Serverless ${use_pvc}=${FALSE} ${use_gpu}=${FALSE} + ... 
${model_path}=${model_name} + Set Test Variable ${model_name} + ${models_names}= Create List ${model_name} + Set Test Variable ${models_names} + Set Test Variable ${model_path} + Set Test Variable ${test_namespace} ${TEST_NS}-${model_name} + IF ${use_pvc} + Set Test Variable ${storage_uri} pvc://${model_name}-claim/${model_path} + ELSE + Set Test Variable ${storage_uri} s3://${S3.BUCKET_1.NAME}/${model_path} + END + IF ${use_gpu} + ${supported_gpu_type}= Convert To Lowercase ${GPU_TYPE} + Set Runtime Image ${supported_gpu_type} + IF "${supported_gpu_type}" == "nvidia" + ${limits}= Create Dictionary nvidia.com/gpu=1 + ELSE IF "${supported_gpu_type}" == "amd" + ${limits}= Create Dictionary amd.com/gpu=1 + ELSE + FAIL msg=Provided GPU type is not yet supported. Only nvidia and amd gpu type are supported + END + Set Test Variable ${limits} + ELSE + Set Test Variable ${limits} &{EMPTY} + END + IF "${KSERVE_MODE}" == "RawDeployment" # robocop: off=inconsistent-variable-name + Set Test Variable ${use_port_forwarding} ${TRUE} + ELSE + Set Test Variable ${use_port_forwarding} ${FALSE} + END + Set Log Level NONE + Set Test Variable ${access_key_id} ${S3.AWS_ACCESS_KEY_ID} + Set Test Variable ${access_key} ${S3.AWS_SECRET_ACCESS_KEY} + Set Test Variable ${endpoint} ${MODELS_BUCKET.ENDPOINT} + Set Test Variable ${region} ${MODELS_BUCKET.REGION} + Set Log Level INFO + +Set Runtime Image + [Documentation] Sets up runtime variables for the Suite + [Arguments] ${gpu_type} + IF "${RUNTIME_IMAGE}" == "${EMPTY}" + IF "${gpu_type}" == "nvidia" + Set Test Variable ${runtime_image} quay.io/modh/vllm@sha256:c86ff1e89c86bc9821b75d7f2bbc170b3c13e3ccf538bf543b1110f23e056316 + ELSE IF "${gpu_type}" == "amd" + Set Test Variable ${runtime_image} quay.io/modh/vllm@sha256:10f09eeca822ebe77e127aad7eca2571f859a5536a6023a1baffc6764bcadc6e + ELSE + FAIL msg=Provided GPU type is not yet supported. Only nvidia and amd gpu type are supported + END + ELSE + Log To Console msg= Using the image provided from terminal + END + +Compile Inference Service YAML + [Documentation] Prepare the Inference Service YAML file in order to deploy a model + [Arguments] ${isvc_name} ${model_storage_uri} ${model_format}=caikit ${serving_runtime}=caikit-tgis-runtime + ... ${kserve_mode}=${NONE} ${sa_name}=${DEFAULT_BUCKET_SA_NAME} ${canaryTrafficPercent}=${EMPTY} ${min_replicas}=1 + ... ${scaleTarget}=1 ${scaleMetric}=concurrency ${auto_scale}=${NONE} + ... ${requests_dict}=&{EMPTY} ${limits_dict}=&{EMPTY} ${overlays}=${EMPTY} ${version}=${EMPTY} + IF '${auto_scale}' == '${NONE}' + ${scaleTarget}= Set Variable ${EMPTY} + ${scaleMetric}= Set Variable ${EMPTY} + END + Set Test Variable ${isvc_name} + Set Test Variable ${min_replicas} + Set Test Variable ${sa_name} + Set Test Variable ${model_storage_uri} + Set Test Variable ${scaleTarget} + Set Test Variable ${scaleMetric} + Set Test Variable ${canaryTrafficPercent} + Set Test Variable ${model_format} + Set Test Variable ${version} + Set Test Variable ${serving_runtime} + IF len($overlays) > 0 + FOR ${index} ${overlay} IN ENUMERATE @{overlays} + Log ${index}: ${overlay} + ${rc} ${out}= Run And Return Rc And Output + ... 
oc kustomize ${LLM_RESOURCES_DIRPATH}/serving_runtimes/overlay/${overlay} > ${INFERENCESERVICE_FILLED_FILEPATH} + Should Be Equal As Integers ${rc} ${0} msg=${out} + END + Create File From Template ${INFERENCESERVICE_FILLED_FILEPATH} ${INFERENCESERVICE_FILLED_FILEPATH} + ELSE + Create File From Template ${INFERENCESERVICE_FILEPATH} ${INFERENCESERVICE_FILLED_FILEPATH} + END + IF ${requests_dict} != &{EMPTY} + Log Adding predictor model requests to ${INFERENCESERVICE_FILLED_FILEPATH}: ${requests_dict} console=True # robocop: disable + FOR ${index} ${resource} IN ENUMERATE @{requests_dict.keys()} + Log ${index}- ${resource}:${requests_dict}[${resource}] + ${rc} ${out}= Run And Return Rc And Output + ... yq -i '.spec.predictor.model.resources.requests."${resource}" = "${requests_dict}[${resource}]"' ${INFERENCESERVICE_FILLED_FILEPATH} # robocop: disable + Should Be Equal As Integers ${rc} ${0} msg=${out} + END + END + IF ${limits_dict} != &{EMPTY} + Log Adding predictor model limits to ${INFERENCESERVICE_FILLED_FILEPATH}: ${limits_dict} console=True # robocop: disable + FOR ${index} ${resource} IN ENUMERATE @{limits_dict.keys()} + Log ${index}- ${resource}:${limits_dict}[${resource}] + ${rc} ${out}= Run And Return Rc And Output + ... yq -i '.spec.predictor.model.resources.limits."${resource}" = "${limits_dict}[${resource}]"' ${INFERENCESERVICE_FILLED_FILEPATH} # robocop: disable + Should Be Equal As Integers ${rc} ${0} msg=${out} + END + END + IF $kserve_mode is not None + ${rc} ${out}= Run And Return Rc And Output + ... yq -i '.metadata.annotations."serving.kserve.io/deploymentMode" = "${kserve_mode}"' ${INFERENCESERVICE_FILLED_FILEPATH} # robocop: disable + Should Be Equal As Integers ${rc} ${0} msg=${out} + ELSE + ${exists}= Run Keyword And Return Status Variable Should Exist ${DSC_KSERVE_MODE} + IF ${exists} # done in this way because when use non-admin users they cannot fetch DSC + ${mode}= Set Variable ${DSC_KSERVE_MODE} + ELSE + ${mode}= Get KServe Default Deployment Mode From DSC + END + Log message=Using defaultDeploymentMode set in the DSC: ${mode} + END From 855aa8b16f5108bc53e97b720dc4eae6a28b0b3a Mon Sep 17 00:00:00 2001 From: Raghul-M Date: Fri, 13 Dec 2024 17:51:09 +0530 Subject: [PATCH 2/4] Migration of Python model kserve rest testcase UI -> API --- .../Page/ODH/ODHDashboard/ODHModelServing.resource | 2 +- .../1009__model_serving_triton_on_kserve_api.robot | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource index 5a4baf790..3c0098e97 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource @@ -339,7 +339,7 @@ Get Model Inference ${rc} ${url}= Run And Return Rc And Output ... 
oc get ksvc ${model_name}-predictor -n ${project_title} -o jsonpath='{.status.url}' Should Be Equal As Integers ${rc} 0 - ${curl_cmd}= Set Variable curl -sk ${url}${end_point} -d ${inference_input} + ${curl_cmd}= Set Variable curl -ks ${url}${end_point} -d ${inference_input} ELSE IF '${kserve_mode}' == 'RawDeployment' ${url}= Set Variable http://localhost:${service_port}${end_point} ${curl_cmd}= Set Variable curl -s ${url} -d ${inference_input} --cacert openshift_ca_istio_knative.crt diff --git a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot index 0a9c09726..08a2f8d39 100644 --- a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot +++ b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot @@ -30,23 +30,23 @@ ${LLM_RESOURCES_DIRPATH}= tests/Resources/Files/llm ${INFERENCESERVICE_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/serving_runtimes/base/isvc.yaml ${INFERENCESERVICE_FILEPATH_NEW}= ${LLM_RESOURCES_DIRPATH}/serving_runtimes/isvc ${INFERENCESERVICE_FILLED_FILEPATH}= ${INFERENCESERVICE_FILEPATH_NEW}/isvc_filled.yaml -${ONNX_RUNTIME_NAMEs}= triton-kserve-runtime +${KSERVE_RUNTIME_REST_NAME}= triton-kserve-runtime *** Test Cases *** Test Python Model Rest Inference Via API (Triton on Kserve) # robocop: off=too-long-test-case [Documentation] Test the deployment of python model in Kserve using Triton - [Tags] Tier2 Resources-GPU NVIDIA-GPUs RunThisTest + [Tags] Tier2 RHOAIENG-16912 Setup Test Variables model_name=${PYTHON_MODEL_NAME} use_pvc=${FALSE} use_gpu=${FALSE} ... kserve_mode=${KSERVE_MODE} model_path=triton/model_repository/ - Set Project And Runtime runtime=${ONNX_RUNTIME_NAMEs} protocol=${PROTOCOL} namespace=${test_namespace} + Set Project And Runtime runtime=${KSERVE_RUNTIME_REST_NAME} protocol=${PROTOCOL} namespace=${test_namespace} ... download_in_pvc=${DOWNLOAD_IN_PVC} model_name=${PYTHON_MODEL_NAME} ... storage_size=100Mi memory_request=100Mi ${requests}= Create Dictionary memory=1Gi Compile Inference Service YAML isvc_name=${PYTHON_MODEL_NAME} ... sa_name=models-bucket-sa ... model_storage_uri=${storage_uri} - ... model_format=python serving_runtime=${ONNX_RUNTIME_NAMEs} + ... model_format=python serving_runtime=${KSERVE_RUNTIME_REST_NAME} ... version="1" ... limits_dict=${limits} requests_dict=${requests} kserve_mode=${KSERVE_MODE} Deploy Model Via CLI isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH} From 8dc1f34ac0f6fd2e42f34eee5438fba407d453cb Mon Sep 17 00:00:00 2001 From: Raghul-M Date: Fri, 3 Jan 2025 15:58:37 +0530 Subject: [PATCH 3/4] Fixed PR comments --- .../Resources/CLI/ModelServing/llm.resource | 60 ++++++++- .../ODH/ODHDashboard/ODHModelServing.resource | 2 +- ...__model_serving_triton_on_kserve_api.robot | 121 +----------------- 3 files changed, 62 insertions(+), 121 deletions(-) diff --git a/ods_ci/tests/Resources/CLI/ModelServing/llm.resource b/ods_ci/tests/Resources/CLI/ModelServing/llm.resource index 1d70a0bf3..2cb697471 100644 --- a/ods_ci/tests/Resources/CLI/ModelServing/llm.resource +++ b/ods_ci/tests/Resources/CLI/ModelServing/llm.resource @@ -141,7 +141,7 @@ Compile Inference Service YAML [Arguments] ${isvc_name} ${model_storage_uri} ${model_format}=caikit ${serving_runtime}=caikit-tgis-runtime ... 
${kserve_mode}=${NONE} ${sa_name}=${DEFAULT_BUCKET_SA_NAME} ${canaryTrafficPercent}=${EMPTY} ${min_replicas}=1 ... ${scaleTarget}=1 ${scaleMetric}=concurrency ${auto_scale}=${NONE} - ... ${requests_dict}=&{EMPTY} ${limits_dict}=&{EMPTY} ${overlays}=${EMPTY} + ... ${requests_dict}=&{EMPTY} ${limits_dict}=&{EMPTY} ${overlays}=${EMPTY} ${version}=${EMPTY} IF '${auto_scale}' == '${NONE}' ${scaleTarget}= Set Variable ${EMPTY} ${scaleMetric}= Set Variable ${EMPTY} @@ -154,6 +154,7 @@ Compile Inference Service YAML Set Test Variable ${scaleMetric} Set Test Variable ${canaryTrafficPercent} Set Test Variable ${model_format} + Set Test Variable ${version} Set Test Variable ${serving_runtime} IF len($overlays) > 0 FOR ${index} ${overlay} IN ENUMERATE @{overlays} @@ -415,6 +416,46 @@ Query Model Multiple Times END END +Setup Test Variables # robocop: off=too-many-calls-in-keyword + [Documentation] Sets up variables for the Suite + [Arguments] ${model_name} ${kserve_mode}=Serverless ${use_pvc}=${FALSE} ${use_gpu}=${FALSE} + ... ${model_path}=${model_name} + Set Test Variable ${model_name} + ${models_names}= Create List ${model_name} + Set Test Variable ${models_names} + Set Test Variable ${model_path} + Set Test Variable ${test_namespace} ${TEST_NS}-${model_name} + IF ${use_pvc} + Set Test Variable ${storage_uri} pvc://${model_name}-claim/${model_path} + ELSE + Set Test Variable ${storage_uri} s3://${S3.BUCKET_1.NAME}/${model_path} + END + IF ${use_gpu} + ${supported_gpu_type}= Convert To Lowercase ${GPU_TYPE} + Set Runtime Image ${supported_gpu_type} + IF "${supported_gpu_type}" == "nvidia" + ${limits}= Create Dictionary nvidia.com/gpu=1 + ELSE IF "${supported_gpu_type}" == "amd" + ${limits}= Create Dictionary amd.com/gpu=1 + ELSE + FAIL msg=Provided GPU type is not yet supported. Only nvidia and amd gpu type are supported + END + Set Test Variable ${limits} + ELSE + Set Test Variable ${limits} &{EMPTY} + END + IF "${KSERVE_MODE}" == "RawDeployment" # robocop: off=inconsistent-variable-name + Set Test Variable ${use_port_forwarding} ${TRUE} + ELSE + Set Test Variable ${use_port_forwarding} ${FALSE} + END + Set Log Level NONE + Set Test Variable ${access_key_id} ${S3.AWS_ACCESS_KEY_ID} + Set Test Variable ${access_key} ${S3.AWS_SECRET_ACCESS_KEY} + Set Test Variable ${endpoint} ${MODELS_BUCKET.ENDPOINT} + Set Test Variable ${region} ${MODELS_BUCKET.REGION} + Set Log Level INFO + Compile Deploy And Query LLM model [Documentation] Group together the test steps for preparing, deploying ... and querying a model @@ -910,3 +951,20 @@ Remove Model Mount Path From Runtime ${rc} ${out}= Run And Return Rc And Output ... oc patch servingruntime ${runtime} -n ${namespace} --type='json' -p='[{"op": "remove", "path": "/spec/containers/0/args/1"}]' Should Be Equal As Integers ${rc} ${0} msg=${out} + + +Set Runtime Image + [Documentation] Sets up runtime variables for the Suite + [Arguments] ${gpu_type} + IF "${RUNTIME_IMAGE}" == "${EMPTY}" + IF "${gpu_type}" == "nvidia" + Set Test Variable ${runtime_image} quay.io/modh/vllm@sha256:c86ff1e89c86bc9821b75d7f2bbc170b3c13e3ccf538bf543b1110f23e056316 + ELSE IF "${gpu_type}" == "amd" + Set Test Variable ${runtime_image} quay.io/modh/vllm@sha256:10f09eeca822ebe77e127aad7eca2571f859a5536a6023a1baffc6764bcadc6e + ELSE + FAIL msg=Provided GPU type is not yet supported. 
Only nvidia and amd gpu type are supported + END + ELSE + Log To Console msg= Using the image provided from terminal + END + diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource index 3c0098e97..1b3286ba6 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHModelServing.resource @@ -339,7 +339,7 @@ Get Model Inference ${rc} ${url}= Run And Return Rc And Output ... oc get ksvc ${model_name}-predictor -n ${project_title} -o jsonpath='{.status.url}' Should Be Equal As Integers ${rc} 0 - ${curl_cmd}= Set Variable curl -ks ${url}${end_point} -d ${inference_input} + ${curl_cmd}= Set Variable curl -s ${url}${end_point} -d ${inference_input} --cacert openshift_ca_istio_knative.crt ELSE IF '${kserve_mode}' == 'RawDeployment' ${url}= Set Variable http://localhost:${service_port}${end_point} ${curl_cmd}= Set Variable curl -s ${url} -d ${inference_input} --cacert openshift_ca_istio_knative.crt diff --git a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot index 08a2f8d39..d73fb5fa9 100644 --- a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot +++ b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot @@ -36,7 +36,7 @@ ${KSERVE_RUNTIME_REST_NAME}= triton-kserve-runtime *** Test Cases *** Test Python Model Rest Inference Via API (Triton on Kserve) # robocop: off=too-long-test-case [Documentation] Test the deployment of python model in Kserve using Triton - [Tags] Tier2 RHOAIENG-16912 + [Tags] Tier2 RHOAIENG-16912 RunThisTest Setup Test Variables model_name=${PYTHON_MODEL_NAME} use_pvc=${FALSE} use_gpu=${FALSE} ... kserve_mode=${KSERVE_MODE} model_path=triton/model_repository/ Set Project And Runtime runtime=${KSERVE_RUNTIME_REST_NAME} protocol=${PROTOCOL} namespace=${test_namespace} @@ -66,7 +66,7 @@ Test Python Model Rest Inference Via API (Triton on Kserve) # robocop: off=to Verify Model Inference With Retries model_name=${PYTHON_MODEL_NAME} inference_input=${INFERENCE_REST_INPUT_PYTHON} ... expected_inference_output=${EXPECTED_INFERENCE_REST_OUTPUT_PYTHON} project_title=${test_namespace} ... deployment_mode=Cli kserve_mode=${KSERVE_MODE} service_port=${service_port} - ... end_point=/v2/models/${model_name}/infer retries=3 + ... end_point=/v2/models/${model_name}/infer retries=3 [Teardown] Run Keywords ... Clean Up Test Project test_ns=${test_namespace} ... isvc_names=${models_names} wait_prj_deletion=${FALSE} kserve_mode=${KSERVE_MODE} @@ -88,120 +88,3 @@ Suite Teardown Set Default Storage Class In GCP default=standard-csi RHOSi Teardown -Setup Test Variables # robocop: off=too-many-calls-in-keyword - [Documentation] Sets up variables for the Suite - [Arguments] ${model_name} ${kserve_mode}=Serverless ${use_pvc}=${FALSE} ${use_gpu}=${FALSE} - ... 
${model_path}=${model_name} - Set Test Variable ${model_name} - ${models_names}= Create List ${model_name} - Set Test Variable ${models_names} - Set Test Variable ${model_path} - Set Test Variable ${test_namespace} ${TEST_NS}-${model_name} - IF ${use_pvc} - Set Test Variable ${storage_uri} pvc://${model_name}-claim/${model_path} - ELSE - Set Test Variable ${storage_uri} s3://${S3.BUCKET_1.NAME}/${model_path} - END - IF ${use_gpu} - ${supported_gpu_type}= Convert To Lowercase ${GPU_TYPE} - Set Runtime Image ${supported_gpu_type} - IF "${supported_gpu_type}" == "nvidia" - ${limits}= Create Dictionary nvidia.com/gpu=1 - ELSE IF "${supported_gpu_type}" == "amd" - ${limits}= Create Dictionary amd.com/gpu=1 - ELSE - FAIL msg=Provided GPU type is not yet supported. Only nvidia and amd gpu type are supported - END - Set Test Variable ${limits} - ELSE - Set Test Variable ${limits} &{EMPTY} - END - IF "${KSERVE_MODE}" == "RawDeployment" # robocop: off=inconsistent-variable-name - Set Test Variable ${use_port_forwarding} ${TRUE} - ELSE - Set Test Variable ${use_port_forwarding} ${FALSE} - END - Set Log Level NONE - Set Test Variable ${access_key_id} ${S3.AWS_ACCESS_KEY_ID} - Set Test Variable ${access_key} ${S3.AWS_SECRET_ACCESS_KEY} - Set Test Variable ${endpoint} ${MODELS_BUCKET.ENDPOINT} - Set Test Variable ${region} ${MODELS_BUCKET.REGION} - Set Log Level INFO - -Set Runtime Image - [Documentation] Sets up runtime variables for the Suite - [Arguments] ${gpu_type} - IF "${RUNTIME_IMAGE}" == "${EMPTY}" - IF "${gpu_type}" == "nvidia" - Set Test Variable ${runtime_image} quay.io/modh/vllm@sha256:c86ff1e89c86bc9821b75d7f2bbc170b3c13e3ccf538bf543b1110f23e056316 - ELSE IF "${gpu_type}" == "amd" - Set Test Variable ${runtime_image} quay.io/modh/vllm@sha256:10f09eeca822ebe77e127aad7eca2571f859a5536a6023a1baffc6764bcadc6e - ELSE - FAIL msg=Provided GPU type is not yet supported. Only nvidia and amd gpu type are supported - END - ELSE - Log To Console msg= Using the image provided from terminal - END - -Compile Inference Service YAML - [Documentation] Prepare the Inference Service YAML file in order to deploy a model - [Arguments] ${isvc_name} ${model_storage_uri} ${model_format}=caikit ${serving_runtime}=caikit-tgis-runtime - ... ${kserve_mode}=${NONE} ${sa_name}=${DEFAULT_BUCKET_SA_NAME} ${canaryTrafficPercent}=${EMPTY} ${min_replicas}=1 - ... ${scaleTarget}=1 ${scaleMetric}=concurrency ${auto_scale}=${NONE} - ... ${requests_dict}=&{EMPTY} ${limits_dict}=&{EMPTY} ${overlays}=${EMPTY} ${version}=${EMPTY} - IF '${auto_scale}' == '${NONE}' - ${scaleTarget}= Set Variable ${EMPTY} - ${scaleMetric}= Set Variable ${EMPTY} - END - Set Test Variable ${isvc_name} - Set Test Variable ${min_replicas} - Set Test Variable ${sa_name} - Set Test Variable ${model_storage_uri} - Set Test Variable ${scaleTarget} - Set Test Variable ${scaleMetric} - Set Test Variable ${canaryTrafficPercent} - Set Test Variable ${model_format} - Set Test Variable ${version} - Set Test Variable ${serving_runtime} - IF len($overlays) > 0 - FOR ${index} ${overlay} IN ENUMERATE @{overlays} - Log ${index}: ${overlay} - ${rc} ${out}= Run And Return Rc And Output - ... 
oc kustomize ${LLM_RESOURCES_DIRPATH}/serving_runtimes/overlay/${overlay} > ${INFERENCESERVICE_FILLED_FILEPATH} - Should Be Equal As Integers ${rc} ${0} msg=${out} - END - Create File From Template ${INFERENCESERVICE_FILLED_FILEPATH} ${INFERENCESERVICE_FILLED_FILEPATH} - ELSE - Create File From Template ${INFERENCESERVICE_FILEPATH} ${INFERENCESERVICE_FILLED_FILEPATH} - END - IF ${requests_dict} != &{EMPTY} - Log Adding predictor model requests to ${INFERENCESERVICE_FILLED_FILEPATH}: ${requests_dict} console=True # robocop: disable - FOR ${index} ${resource} IN ENUMERATE @{requests_dict.keys()} - Log ${index}- ${resource}:${requests_dict}[${resource}] - ${rc} ${out}= Run And Return Rc And Output - ... yq -i '.spec.predictor.model.resources.requests."${resource}" = "${requests_dict}[${resource}]"' ${INFERENCESERVICE_FILLED_FILEPATH} # robocop: disable - Should Be Equal As Integers ${rc} ${0} msg=${out} - END - END - IF ${limits_dict} != &{EMPTY} - Log Adding predictor model limits to ${INFERENCESERVICE_FILLED_FILEPATH}: ${limits_dict} console=True # robocop: disable - FOR ${index} ${resource} IN ENUMERATE @{limits_dict.keys()} - Log ${index}- ${resource}:${limits_dict}[${resource}] - ${rc} ${out}= Run And Return Rc And Output - ... yq -i '.spec.predictor.model.resources.limits."${resource}" = "${limits_dict}[${resource}]"' ${INFERENCESERVICE_FILLED_FILEPATH} # robocop: disable - Should Be Equal As Integers ${rc} ${0} msg=${out} - END - END - IF $kserve_mode is not None - ${rc} ${out}= Run And Return Rc And Output - ... yq -i '.metadata.annotations."serving.kserve.io/deploymentMode" = "${kserve_mode}"' ${INFERENCESERVICE_FILLED_FILEPATH} # robocop: disable - Should Be Equal As Integers ${rc} ${0} msg=${out} - ELSE - ${exists}= Run Keyword And Return Status Variable Should Exist ${DSC_KSERVE_MODE} - IF ${exists} # done in this way because when use non-admin users they cannot fetch DSC - ${mode}= Set Variable ${DSC_KSERVE_MODE} - ELSE - ${mode}= Get KServe Default Deployment Mode From DSC - END - Log message=Using defaultDeploymentMode set in the DSC: ${mode} - END From 569ee66d18b9fdae679dc1a739775f5126d30af2 Mon Sep 17 00:00:00 2001 From: Raghul-M Date: Fri, 3 Jan 2025 16:07:25 +0530 Subject: [PATCH 4/4] removed Runthistest tag --- .../1009__model_serving_triton_on_kserve_api.robot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot index d73fb5fa9..e2b9f309f 100644 --- a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot +++ b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot @@ -36,7 +36,7 @@ ${KSERVE_RUNTIME_REST_NAME}= triton-kserve-runtime *** Test Cases *** Test Python Model Rest Inference Via API (Triton on Kserve) # robocop: off=too-long-test-case [Documentation] Test the deployment of python model in Kserve using Triton - [Tags] Tier2 RHOAIENG-16912 RunThisTest + [Tags] Tier2 RHOAIENG-16912 Setup Test Variables model_name=${PYTHON_MODEL_NAME} use_pvc=${FALSE} use_gpu=${FALSE} ... kserve_mode=${KSERVE_MODE} model_path=triton/model_repository/ Set Project And Runtime runtime=${KSERVE_RUNTIME_REST_NAME} protocol=${PROTOCOL} namespace=${test_namespace}
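
With the RunThisTest tag dropped, the migrated case is selected by its RHOAIENG-16912 tag. A rough usage sketch under assumed cluster values follows; the hostname and the usual ods-ci global variables (S3 credentials, GPU type, etc.) are placeholders, not defined by this series.

# Hypothetical invocation: run only the migrated Triton-on-KServe API test by tag,
# using the suite variables introduced in this patch series.
robot --include RHOAIENG-16912 \
      --variable KSERVE_MODE:Serverless --variable PROTOCOL:http \
      ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot

# The KServe v2 REST call the test ultimately issues in Serverless mode. The host is
# whatever `oc get ksvc python-predictor -n tritonmodel-python` reports (the
# apps.example.com value below is a placeholder); the keyword itself passes the
# Istio/Knative CA bundle rather than skipping verification with -k.
curl -sk https://python-predictor-tritonmodel-python.apps.example.com/v2/models/python/infer \
     -d @ods_ci/tests/Resources/Files/triton/kserve-triton-python-rest-input.json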