From 868ab73fc08153a4365a81b1d23bf1b735dcf4a9 Mon Sep 17 00:00:00 2001
From: RAGHUL M
Date: Mon, 13 Jan 2025 20:13:10 +0530
Subject: [PATCH] Migration of Python model kserve grpc testcase UI -> API
 (#2155)

---
 .../Resources/CLI/ModelServing/llm.resource   |  5 +-
 .../triton_servingruntime_grpc.yaml           | 56 +++++++++++++++++++
 ...n => kserve-triton-python-grpc-input.json} |  0
 ... => kserve-triton-python-grpc-output.json} |  0
 ...__model_serving_triton_on_kserve_api.robot | 49 ++++++++++++++++
 5 files changed, 106 insertions(+), 4 deletions(-)
 create mode 100644 ods_ci/tests/Resources/Files/llm/serving_runtimes/triton_servingruntime_grpc.yaml
 rename ods_ci/tests/Resources/Files/triton/{kserve-triton-python-gRPC-input.json => kserve-triton-python-grpc-input.json} (100%)
 rename ods_ci/tests/Resources/Files/triton/{kserve-triton-python-gRPC-output.json => kserve-triton-python-grpc-output.json} (100%)

diff --git a/ods_ci/tests/Resources/CLI/ModelServing/llm.resource b/ods_ci/tests/Resources/CLI/ModelServing/llm.resource
index 2cb697471..1a3d1263a 100644
--- a/ods_ci/tests/Resources/CLI/ModelServing/llm.resource
+++ b/ods_ci/tests/Resources/CLI/ModelServing/llm.resource
@@ -30,7 +30,6 @@ ${SERVICEMESH_CR_NS}=    istio-system
 ...    triton-kserve-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/triton_servingruntime_{{protocol}}.yaml    # robocop: disable
 ${DOWNLOAD_PVC_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/download_model_in_pvc.yaml
 ${DOWNLOAD_PVC_FILLED_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/download_model_in_pvc_filled.yaml
-
 ${DOWNLOAD_PROMPTS_PVC_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/download_prompts_in_pvc.yaml
 ${DOWNLOAD_PROMPTS_PVC_FILLED_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/download_prompts_in_pvc_filled.yaml
 ${MATCHING_RATIO}=    ${60}
@@ -141,7 +140,7 @@ Compile Inference Service YAML
     [Arguments]    ${isvc_name}    ${model_storage_uri}    ${model_format}=caikit    ${serving_runtime}=caikit-tgis-runtime
     ...    ${kserve_mode}=${NONE}    ${sa_name}=${DEFAULT_BUCKET_SA_NAME}    ${canaryTrafficPercent}=${EMPTY}    ${min_replicas}=1
     ...    ${scaleTarget}=1    ${scaleMetric}=concurrency    ${auto_scale}=${NONE}
-    ...    ${requests_dict}=&{EMPTY}    ${limits_dict}=&{EMPTY}    ${overlays}=${EMPTY}    ${version}=${EMPTY} 
+    ...    ${requests_dict}=&{EMPTY}    ${limits_dict}=&{EMPTY}    ${overlays}=${EMPTY}    ${version}=${EMPTY}
     IF    '${auto_scale}' == '${NONE}'
         ${scaleTarget}=    Set Variable    ${EMPTY}
         ${scaleMetric}=    Set Variable    ${EMPTY}
@@ -199,7 +198,6 @@ Compile Inference Service YAML
         Log    message=Using defaultDeploymentMode set in the DSC: ${mode}
     END
 
-
 Model Response Should Match The Expectation
     [Documentation]    Checks that the actual model response matches the expected answer.
     ...                The goals are:
@@ -952,7 +950,6 @@ Remove Model Mount Path From Runtime
     ...    oc patch servingruntime ${runtime} -n ${namespace} --type='json' -p='[{"op": "remove", "path": "/spec/containers/0/args/1"}]'
     Should Be Equal As Integers    ${rc}    ${0}    msg=${out}
 
-
 Set Runtime Image
     [Documentation]    Sets up runtime variables for the Suite
     [Arguments]    ${gpu_type}
diff --git a/ods_ci/tests/Resources/Files/llm/serving_runtimes/triton_servingruntime_grpc.yaml b/ods_ci/tests/Resources/Files/llm/serving_runtimes/triton_servingruntime_grpc.yaml
new file mode 100644
index 000000000..2e9e15905
--- /dev/null
+++ b/ods_ci/tests/Resources/Files/llm/serving_runtimes/triton_servingruntime_grpc.yaml
@@ -0,0 +1,56 @@
+apiVersion: serving.kserve.io/v1alpha1
+kind: ServingRuntime
+metadata:
+  name: triton-kserve-runtime
+spec:
+  annotations:
+    prometheus.kserve.io/path: /metrics
+    prometheus.kserve.io/port: "8002"
+  containers:
+    - args:
+        - tritonserver
+        - --model-store=/mnt/models
+        - --grpc-port=9000
+        - --http-port=8080
+        - --allow-grpc=true
+        - --allow-http=true
+      image: nvcr.io/nvidia/tritonserver:24.10-py3
+      name: kserve-container
+      ports:
+        - containerPort: 9000
+          name: h2c
+          protocol: TCP
+      resources:
+        limits:
+          cpu: "1"
+          memory: 2Gi
+        requests:
+          cpu: "1"
+          memory: 2Gi
+  protocolVersions:
+    - v2
+    - grpc-v2
+  supportedModelFormats:
+    - autoSelect: true
+      name: tensorrt
+      version: "8"
+    - autoSelect: true
+      name: tensorflow
+      version: "1"
+    - autoSelect: true
+      name: tensorflow
+      version: "2"
+    - autoSelect: true
+      name: onnx
+      version: "1"
+    - name: pytorch
+      version: "1"
+    - autoSelect: true
+      name: triton
+      version: "2"
+    - autoSelect: true
+      name: xgboost
+      version: "1"
+    - autoSelect: true
+      name: python
+      version: "1"
diff --git a/ods_ci/tests/Resources/Files/triton/kserve-triton-python-gRPC-input.json b/ods_ci/tests/Resources/Files/triton/kserve-triton-python-grpc-input.json
similarity index 100%
rename from ods_ci/tests/Resources/Files/triton/kserve-triton-python-gRPC-input.json
rename to ods_ci/tests/Resources/Files/triton/kserve-triton-python-grpc-input.json
diff --git a/ods_ci/tests/Resources/Files/triton/kserve-triton-python-gRPC-output.json b/ods_ci/tests/Resources/Files/triton/kserve-triton-python-grpc-output.json
similarity index 100%
rename from ods_ci/tests/Resources/Files/triton/kserve-triton-python-gRPC-output.json
rename to ods_ci/tests/Resources/Files/triton/kserve-triton-python-grpc-output.json
diff --git a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot
index e2b9f309f..371c8e210 100644
--- a/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot
+++ b/ods_ci/tests/Tests/1000__model_serving/1009__model_serving_triton_on_kserve/1009__model_serving_triton_on_kserve_api.robot
@@ -19,6 +19,10 @@ Test Tags    Kserve
 
 *** Variables ***
 ${PYTHON_MODEL_NAME}=    python
+${EXPECTED_INFERENCE_GRPC_OUTPUT_PYTHON}=    {"modelName":"python","modelVersion":"1","id":"1","outputs":[{"name":"OUTPUT0","datatype":"FP32","shape":["4"]},{"name":"OUTPUT1","datatype":"FP32","shape":["4"]}],"rawOutputContents":["AgAAAAAAAAAAAAAAAAAAAA==","AAQAAAAAAAAAAAAAAAAAAA=="]}
+${INFERENCE_GRPC_INPUT_PYTHONFILE}=    tests/Resources/Files/triton/kserve-triton-python-grpc-input.json
+${KSERVE_MODE}=    Serverless    # Serverless
+${PROTOCOL_GRPC}=    grpc
 ${EXPECTED_INFERENCE_REST_OUTPUT_PYTHON}=    {"model_name":"python","model_version":"1","outputs":[{"name":"OUTPUT0","datatype":"FP32","shape":[4],"data":[0.921442985534668,0.6223347187042236,0.8059385418891907,1.2578542232513428]},{"name":"OUTPUT1","datatype":"FP32","shape":[4],"data":[0.49091365933418274,-0.027157962322235107,-0.5641784071922302,0.6906309723854065]}]}
 ${INFERENCE_REST_INPUT_PYTHON}=    @tests/Resources/Files/triton/kserve-triton-python-rest-input.json
 ${KSERVE_MODE}=    Serverless    # Serverless
@@ -31,6 +35,8 @@ ${INFERENCESERVICE_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/serving_runtimes/base/
 ${INFERENCESERVICE_FILEPATH_NEW}=    ${LLM_RESOURCES_DIRPATH}/serving_runtimes/isvc
 ${INFERENCESERVICE_FILLED_FILEPATH}=    ${INFERENCESERVICE_FILEPATH_NEW}/isvc_filled.yaml
 ${KSERVE_RUNTIME_REST_NAME}=    triton-kserve-runtime
+${PATTERN}=    https:\/\/([^\/:]+)
+${PROTOBUFF_FILE}=    tests/Resources/Files/triton/grpc_predict_v2.proto
 
 
 *** Test Cases ***
@@ -73,6 +79,49 @@ Test Python Model Rest Inference Via API (Triton on Kserve)    # robocop: off=to
     ...    AND
     ...    Run Keyword If    "${KSERVE_MODE}"=="RawDeployment"    Terminate Process    triton-process    kill=true
 
+Test Python Model Grpc Inference Via API (Triton on Kserve)    # robocop: off=too-long-test-case
+    [Documentation]    Test the deployment of python model in Kserve using Triton
+    [Tags]    Tier2    RHOAIENG-16912
+
+    Setup Test Variables    model_name=${PYTHON_MODEL_NAME}    use_pvc=${FALSE}    use_gpu=${FALSE}
+    ...    kserve_mode=${KSERVE_MODE}    model_path=triton/model_repository/
+    Set Project And Runtime    runtime=${KSERVE_RUNTIME_REST_NAME}    protocol=${PROTOCOL_GRPC}    namespace=${test_namespace}
+    ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${PYTHON_MODEL_NAME}
+    ...    storage_size=100Mi    memory_request=100Mi
+    ${requests}=    Create Dictionary    memory=1Gi
+    Compile Inference Service YAML    isvc_name=${PYTHON_MODEL_NAME}
+    ...    sa_name=models-bucket-sa
+    ...    model_storage_uri=${storage_uri}
+    ...    model_format=python    serving_runtime=${KSERVE_RUNTIME_REST_NAME}
+    ...    version="1"
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
+    Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
+    ...    namespace=${test_namespace}
+    # File is not needed anymore after applying
+    Remove File    ${INFERENCESERVICE_FILLED_FILEPATH}
+    Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${PYTHON_MODEL_NAME}
+    ...    namespace=${test_namespace}
+    ${pod_name}=    Get Pod Name    namespace=${test_namespace}
+    ...    label_selector=serving.kserve.io/inferenceservice=${PYTHON_MODEL_NAME}
+    ${valued}    ${host}=    Run And Return Rc And Output    oc get ksvc ${PYTHON_MODEL_NAME}-predictor -o jsonpath='{.status.url}'
+    Log    ${valued}
+    ${host}=    Evaluate    re.search(r"${PATTERN}", r"${host}").group(1)    re
+    Log    ${host}
+    ${inference_output}=    Query Model With GRPCURL    host=${host}    port=443
+    ...    endpoint=inference.GRPCInferenceService/ModelInfer
+    ...    json_body=@    input_filepath=${INFERENCE_GRPC_INPUT_PYTHONFILE}
+    ...    insecure=${True}    protobuf_file=${PROTOBUFF_FILE}    json_header=${NONE}
+    ${inference_output}=    Evaluate    json.dumps(${inference_output})
+    Log    ${inference_output}
+    ${result}    ${list}=    Inference Comparison    ${EXPECTED_INFERENCE_GRPC_OUTPUT_PYTHON}    ${inference_output}
+    Log    ${result}
+    Log    ${list}
+    [Teardown]    Run Keywords
+    ...    Clean Up Test Project    test_ns=${test_namespace}
+    ...    isvc_names=${models_names}    wait_prj_deletion=${FALSE}    kserve_mode=${KSERVE_MODE}
+    ...    AND
+    ...    Run Keyword If    "${KSERVE_MODE}"=="RawDeployment"    Terminate Process    triton-process    kill=true
+
 
 *** Keywords ***
 Suite Setup