From 5b6212c247b0b2beeaaff04cb17806e80669c357 Mon Sep 17 00:00:00 2001
From: Berto D'Attoma <88311595+bdattoma@users.noreply.github.com>
Date: Thu, 26 Oct 2023 15:50:50 +0200
Subject: [PATCH] Add WatsonX metrics test (#985)

* add initial test for model metrics

Signed-off-by: bdattoma

* add checks for query type and cpu usage

Signed-off-by: bdattoma

* remove commented code

Signed-off-by: bdattoma

* add kw to protected list

Signed-off-by: bdattoma

* add docs and minor fixes

Signed-off-by: bdattoma

* add polarion id

Signed-off-by: bdattoma

* fix some alerts

Signed-off-by: bdattoma

* increase metric check stability

Signed-off-by: bdattoma

* fix expected model response

Signed-off-by: bdattoma

---------

Signed-off-by: bdattoma
---
 .../Files/llm/model_expected_responses.json   |   2 +-
 .../Resources/Files/llm/uwm_cm_conf.yaml      |  10 +
 .../Resources/Files/llm/uwm_cm_enable.yaml    |   8 +
 .../Page/ODH/Monitoring/Monitoring.resource   |  25 +++
 ods_ci/tests/Resources/RHOSi.resource         |   1 +
 .../422__model_serving_llm.robot              | 180 +++++++++++++++++-
 6 files changed, 220 insertions(+), 6 deletions(-)
 create mode 100644 ods_ci/tests/Resources/Files/llm/uwm_cm_conf.yaml
 create mode 100644 ods_ci/tests/Resources/Files/llm/uwm_cm_enable.yaml

diff --git a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
index 22d37db7a..32a219318 100644
--- a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
+++ b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
@@ -6,7 +6,7 @@
     "flan-t5-small-caikit": {
         "generatedTokenCount": 5,
         "response_text": "74 degrees F",
-        "streamed_response_text": "{'details':{}}{'tokens':[{'text':'▁','logprob':-1.6961849927902222}],'details':{'generated_tokens':1}}{'generated_text':'74','tokens':[{'text':'74','logprob':-3.2507317066192627}],'details':{'generated_tokens':2}}{'generated_text':'degrees','tokens':[{'text':'▁degrees','logprob':-0.4324553906917572}],'details':{'generated_tokens':3}}{'generated_text':'F','tokens':[{'text':'▁F','logprob':-1.3610913753509521}],'details':{'generated_tokens':4}}{'tokens':[{'text':'\u003c/s\u003e','logprob':-0.010431881994009018}],'details':{'finish_reason':'EOS_TOKEN','generated_tokens':5}}"
+        "streamed_response_text": "{'details':{'input_token_count':'8'}}{'tokens':[{'text':'▁','logprob':-1.6961838006973267}],'details':{'generated_tokens':1}}{'generated_text':'74','tokens':[{'text':'74','logprob':-3.250730037689209}],'details':{'generated_tokens':2}}{'generated_text':'degrees','tokens':[{'text':'▁degrees','logprob':-0.4324559271335602}],'details':{'generated_tokens':3}}{'generated_text':'F','tokens':[{'text':'▁F','logprob':-1.361091136932373}],'details':{'generated_tokens':4}}{'tokens':[{'text':'\u003c/s\u003e','logprob':-0.010431881994009018}],'details':{'finish_reason':'EOS_TOKEN','generated_tokens':5}}"
     },
     "bloom-560m-caikit": {
         "generatedTokenCount": 20,
diff --git a/ods_ci/tests/Resources/Files/llm/uwm_cm_conf.yaml b/ods_ci/tests/Resources/Files/llm/uwm_cm_conf.yaml
new file mode 100644
index 000000000..c3a15e86a
--- /dev/null
+++ b/ods_ci/tests/Resources/Files/llm/uwm_cm_conf.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: user-workload-monitoring-config
+  namespace: openshift-user-workload-monitoring
+data:
+  config.yaml: |
+    prometheus:
+      logLevel: debug
+      retention: 15d #Change as needed
\ No newline at end of file
diff --git a/ods_ci/tests/Resources/Files/llm/uwm_cm_enable.yaml b/ods_ci/tests/Resources/Files/llm/uwm_cm_enable.yaml
new file mode 100644
index 000000000..710f4777a
--- /dev/null
+++ b/ods_ci/tests/Resources/Files/llm/uwm_cm_enable.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cluster-monitoring-config
+  namespace: openshift-monitoring
+data:
+  config.yaml: |
+    enableUserWorkload: true
\ No newline at end of file
diff --git a/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource b/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource
index 8df0ca984..e456bea60 100644
--- a/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource
+++ b/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource
@@ -135,3 +135,28 @@ Suite Availability Teardown
     Run Keyword And Warn On Failure    Alerts Should Not Be Firing
     ...    pm_url=${pm_url}    pm_token=${pm_token}    expected-firing-alert=DeadManSnitch
     ...    message_prefix=Suite Trdwn: ${SUITE NAME}:
+
+Get OpenShift Thanos URL
+    [Documentation]    Returns the host of the `thanos-querier` Route in `openshift-monitoring` (host only, no scheme)
+    ${url}=    Oc Get    kind=Route    name=thanos-querier    namespace=openshift-monitoring
+    ...    fields=['status.ingress[0].host']
+    RETURN    ${url}[0][status.ingress[0].host]
+
+Generate Thanos Token
+    [Documentation]    Returns the output of `oc whoami -t`; fails if that command exits with a non-zero code.
+    ${rc}    ${out}=    Run And Return Rc And Output    oc whoami -t
+    Should Be Equal As Integers    ${rc}    ${0}
+    RETURN    ${out}
+
+Get Thanos Metrics List
+    [Documentation]    Returns the `__name__` label values (metric names) served by thanos-querier, quotes stripped.
+    ...    With ${search_text} the names are filtered with grep (one per line). ${rc} is not checked here.
+    [Arguments]    ${thanos_url}    ${thanos_token}    ${search_text}=${EMPTY}
+    ${cmd}=    Set Variable    curl -k -H "Authorization: Bearer ${thanos_token}" https://${thanos_url}/api/v1/label/__name__/values
+    IF    "${search_text}" == "${EMPTY}"
+        ${cmd}=    Catenate    ${cmd}    | jq '.data'
+    ELSE
+        ${cmd}=    Catenate    ${cmd}    | jq '.data[]' | grep ${search_text}
+    END
+    ${rc}    ${out}=    Run And Return Rc And Output    ${cmd} | tr -d '"'
+    RETURN    ${out}
diff --git a/ods_ci/tests/Resources/RHOSi.resource b/ods_ci/tests/Resources/RHOSi.resource
index f73f069e2..9e9774135 100644
--- a/ods_ci/tests/Resources/RHOSi.resource
+++ b/ods_ci/tests/Resources/RHOSi.resource
@@ -42,6 +42,7 @@ Resource    Common.robot
 ...    Fill Data Connection Form
 ...    Create Secret For S3-Like Buckets
 ...    Login To OCP Using API
+...    Generate Thanos Token
 
 
 *** Keywords ***
diff --git a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/422__model_serving_llm.robot b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/422__model_serving_llm.robot
index c63136f9e..6504cd593 100644
--- a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/422__model_serving_llm.robot
+++ b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/422__model_serving_llm.robot
@@ -5,7 +5,7 @@ Resource    ../../../Resources/OCP.resource
 Resource    ../../../Resources/Page/Operators/ISVs.resource
 Library    OpenShiftLibrary
 Suite Setup    Install Model Serving Stack Dependencies
-# Suite Teardown
+Suite Teardown    RHOSi Teardown
 
 
 *** Variables ***
@@ -38,6 +38,8 @@ ${INFERENCESERVICE_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/caikit_isvc.yaml
 ${DEFAULT_BUCKET_SECRET_NAME}=    models-bucket-secret
 ${DEFAULT_BUCKET_SA_NAME}=    models-bucket-sa
 ${EXP_RESPONSES_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/model_expected_responses.json
+${UWM_ENABLE_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/uwm_cm_enable.yaml
+${UWM_CONFIG_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/uwm_cm_conf.yaml
 ${SKIP_PREREQS_INSTALL}=    ${FALSE}
 ${SCRIPT_BASED_INSTALL}=    ${FALSE}
${MODELS_BUCKET}= ${S3.BUCKET_3} @@ -464,13 +466,55 @@ Verify Runtime Upgrade Does Not Affect Deployed Models [Teardown] Clean Up Test Project test_ns=${test_namespace} ... isvc_names=${models_names} +Verify User Can Access Model Metrics From UWM + [Documentation] Verifies that model metrics are available for users in the + ... OpenShift monitoring system (UserWorkloadMonitoring) + ... PARTIALLY DONE: it is checking number of requests, number of successful requests + ... and model pod cpu usage. Waiting for a complete list of expected metrics and + ... derived metrics. + [Tags] ODS-2401 WatsonX + [Setup] Set Project And Runtime namespace=watsonx-metrics enable_metrics=${TRUE} + ${test_namespace}= Set Variable watsonx-metrics + ${flan_model_name}= Set Variable flan-t5-small-caikit + ${models_names}= Create List ${flan_model_name} + ${thanos_url}= Get OpenShift Thanos URL + ${token}= Generate Thanos Token + Compile Inference Service YAML isvc_name=${flan_model_name} + ... sa_name=${DEFAULT_BUCKET_SA_NAME} + ... model_storage_uri=${FLAN_STORAGE_URI} + Deploy Model Via CLI isvc_filepath=${LLM_RESOURCES_DIRPATH}/caikit_isvc_filled.yaml + ... namespace=${test_namespace} + Wait For Pods To Be Ready label_selector=serving.kserve.io/inferenceservice=${flan_model_name} + ... namespace=${test_namespace} + TGI Caikit And Istio Metrics Should Exist thanos_url=${thanos_url} thanos_token=${token} + Query Models And Check Responses Multiple Times models_names=${models_names} + ... endpoint=${CAIKIT_ALLTOKENS_ENDPOINT} n_times=3 + ... namespace=${test_namespace} + Wait Until Keyword Succeeds 50 times 5s + ... User Can Fetch Number Of Requests Over Defined Time thanos_url=${thanos_url} thanos_token=${token} + ... model_name=${flan_model_name} query_kind=single namespace=${test_namespace} period=5m exp_value=3 + Wait Until Keyword Succeeds 20 times 5s + ... User Can Fetch Number Of Successful Requests Over Defined Time thanos_url=${thanos_url} thanos_token=${token} + ... 
model_name=${flan_model_name} namespace=${test_namespace} period=5m exp_value=3 + Wait Until Keyword Succeeds 20 times 5s + ... User Can Fetch CPU Utilization thanos_url=${thanos_url} thanos_token=${token} + ... model_name=${flan_model_name} namespace=${test_namespace} period=5m + Query Models And Check Responses Multiple Times models_names=${models_names} + ... endpoint=${CAIKIT_STREAM_ENDPOINT} n_times=1 streamed_response=${TRUE} + ... namespace=${test_namespace} query_idx=${0} + Wait Until Keyword Succeeds 30 times 5s + ... User Can Fetch Number Of Requests Over Defined Time thanos_url=${thanos_url} thanos_token=${token} + ... model_name=${flan_model_name} query_kind=stream namespace=${test_namespace} period=5m exp_value=1 + [Teardown] Clean Up Test Project test_ns=${test_namespace} + ... isvc_names=${models_names} + *** Keywords *** Install Model Serving Stack Dependencies [Documentation] Instaling And Configuring dependency operators: Service Mesh and Serverless. ... This is likely going to change in the future and it will include a way to skip installation. ... Caikit runtime will be shipped Out-of-the-box and will be removed from here. - # RHOSi Setup + RHOSi Setup IF ${SKIP_PREREQS_INSTALL} == ${FALSE} IF ${SCRIPT_BASED_INSTALL} == ${FALSE} Install Service Mesh Stack @@ -706,12 +750,20 @@ Deploy Caikit Serving Runtime ... oc apply -f ${CAIKIT_FILEPATH} -n ${namespace} Set Project And Runtime - [Arguments] ${namespace} + [Documentation] Creates the DS Project (if not exists), creates the data connection for the models, + ... creates caikit runtime. This can be used as test setup + [Arguments] ${namespace} ${enable_metrics}=${FALSE} Set Up Test OpenShift Project test_ns=${namespace} Create Secret For S3-Like Buckets endpoint=${MODELS_BUCKET.ENDPOINT} ... 
region=${MODELS_BUCKET.REGION} namespace=${namespace} # temporary step - caikit will be shipped OOTB Deploy Caikit Serving Runtime namespace=${namespace} + IF ${enable_metrics} == ${TRUE} + Oc Apply kind=ConfigMap src=${UWM_ENABLE_FILEPATH} + Oc Apply kind=ConfigMap src=${UWM_CONFIG_FILEPATH} + ELSE + Log message=Skipping UserWorkloadMonitoring enablement. + END Create Secret For S3-Like Buckets [Documentation] Configures the cluster to fetch models from a S3-like bucket @@ -830,6 +882,7 @@ Query Models And Check Responses Multiple Times ... endpoint=${endpoint} ... json_body=${body} json_header=${header} ... insecure=${TRUE} skip_res_json=${streamed_response} + Log ${res} Run Keyword And Continue On Failure ... Model Response Should Match The Expectation model_response=${res} model_name=${model_name} ... streamed_response=${streamed_response} query_idx=${query_idx} @@ -870,9 +923,8 @@ Run Install Script ${rc}= Run And Watch Command TARGET_OPERATOR=${SCRIPT_TARGET_OPERATOR} BREW_TAG=${SCRIPT_BREW_TAG} CHECK_UWM=false ./scripts/install/kserve-install.sh ... cwd=caikit-tgis-serving/demo/kserve ELSE - ${rc}= Run And Watch Command TARGET_OPERATOR=${SCRIPT_TARGET_OPERATOR} CHECK_UWM=false ./scripts/install/kserve-install.sh + ${rc}= Run And Watch Command DEPLOY_ODH_OPERATOR=false TARGET_OPERATOR=${SCRIPT_TARGET_OPERATOR} CHECK_UWM=false ./scripts/install/kserve-install.sh ... cwd=caikit-tgis-serving/demo/kserve - END Should Be Equal As Integers ${rc} ${0} @@ -894,3 +946,121 @@ Get Model Pods Creation Date And Image URL ... oc get pod --selector serving.kserve.io/inferenceservice=${model_name} -n ${namespace} -ojson | jq '.items[].spec.containers[].image' | grep caikit-tgis # robocop: disable Should Be Equal As Integers ${rc} ${0} RETURN ${created_at} ${caikitsha} + +User Can Fetch Number Of Requests Over Defined Time + [Documentation] Fetches the `tgi_request_count` metric and checks that it reports the expected + ... 
model information (name, namespace, pod name and type of request). + ... If ${exp_value} is given, it checks also the metric value + [Arguments] ${thanos_url} ${thanos_token} ${model_name} ${namespace} + ... ${query_kind}=single ${period}=30m ${exp_value}=${EMPTY} + ${resp}= Prometheus.Run Query https://${thanos_url} ${thanos_token} tgi_request_count[${period}] + Log ${resp.json()["data"]} + Check Query Response Values response=${resp} exp_namespace=${namespace} + ... exp_model_name=${model_name} exp_query_kind=${query_kind} exp_value=${exp_value} + +User Can Fetch Number Of Successful Requests Over Defined Time + [Documentation] Fetches the `tgi_request_success` metric and checks that it reports the expected + ... model information (name, namespace and type of request). + ... If ${exp_value} is given, it checks also the metric value + [Arguments] ${thanos_url} ${thanos_token} ${model_name} ${namespace} + ... ${query_kind}=single ${period}=30m ${exp_value}=${EMPTY} + ${resp}= Prometheus.Run Query https://${thanos_url} ${thanos_token} tgi_request_success[${period}] + Log ${resp.json()["data"]} + Check Query Response Values response=${resp} exp_namespace=${namespace} + ... exp_model_name=${model_name} exp_query_kind=${query_kind} exp_value=${exp_value} + +User Can Fetch CPU Utilization + [Documentation] Fetches the `pod:container_cpu_usage:sum` metric and checks that it reports the expected + ... model information (pod name and namespace). + ... If ${exp_value} is given, it checks also the metric value + [Arguments] ${thanos_url} ${thanos_token} ${namespace} ${model_name} ${period}=30m ${exp_value}=${EMPTY} + ${resp}= Prometheus.Run Query https://${thanos_url} ${thanos_token} pod:container_cpu_usage:sum{namespace="${namespace}"}[${period}] + ${pod_name}= Oc Get kind=Pod namespace=${namespace} + ... label_selector=serving.kserve.io/inferenceservice=${model_name} + ... 
fields=['metadata.name'] + Log ${resp.json()["data"]} + Check Query Response Values response=${resp} exp_namespace=${namespace} + ... exp_pod_name=${pod_name}[0][metadata.name] exp_value=${exp_value} + +TGI Caikit And Istio Metrics Should Exist + [Documentation] Checks that the `tgi_`, `caikit_` and `istio_` metrics exist. + ... Returns the list of metrics names + [Arguments] ${thanos_url} ${thanos_token} + ${tgi_metrics_names}= Get Thanos Metrics List thanos_url=${thanos_url} thanos_token=${thanos_token} + ... search_text=tgi + Should Not Be Empty ${tgi_metrics_names} + ${tgi_metrics_names}= Split To Lines ${tgi_metrics_names} + ${caikit_metrics_names}= Get Thanos Metrics List thanos_url=${thanos_url} thanos_token=${thanos_token} + ... search_text=caikit + ${caikit_metrics_names}= Split To Lines ${caikit_metrics_names} + ${istio_metrics_names}= Get Thanos Metrics List thanos_url=${thanos_url} thanos_token=${thanos_token} + ... search_text=istio + ${istio_metrics_names}= Split To Lines ${istio_metrics_names} + ${metrics}= Append To List ${tgi_metrics_names} @{caikit_metrics_names} @{istio_metrics_names} + RETURN ${metrics} + +Check Query Response Values # robocop:disable + [Documentation] Implements the metric checks for `User Can Fetch Number Of Requests Over Defined Time` + ... `User Can Fetch Number Of Successful Requests Over Defined Time` and `User Can Fetch CPU Utilization`. + ... It searches among the available metric values for the specific model + [Arguments] ${response} ${exp_namespace} ${exp_model_name}=${EMPTY} ${exp_query_kind}=${EMPTY} ${exp_value}=${EMPTY} ${exp_pod_name}=${EMPTY} + ${json_results}= Set Variable ${response.json()["data"]["result"]} + FOR ${index} ${result} IN ENUMERATE @{json_results} + Log ${index}: ${result} + ${value_keyname}= Run Keyword And Return Status + ... 
Dictionary Should Contain Key ${result} value + IF ${value_keyname} == ${TRUE} + ${curr_value}= Set Variable ${result["value"][-1]} + ELSE + ${curr_value}= Set Variable ${result["values"][-1][-1]} + END + ${source_namespace}= Set Variable ${result["metric"]["namespace"]} + ${checked}= Run Keyword And Return Status Should Be Equal As Strings ${source_namespace} ${exp_namespace} + IF ${checked} == ${FALSE} + Continue For Loop + ELSE + Log message=Metrics source namespaced succesfully checked. Going to next step. + END + IF "${exp_model_name}" != "${EMPTY}" + ${source_model}= Set Variable ${result["metric"]["job"]} + ${checked}= Run Keyword And Return Status Should Be Equal As Strings ${source_model} + ... ${exp_model_name}-metrics + IF ${checked} == ${FALSE} + Continue For Loop + ELSE + Log message=Metrics source model succesfully checked. Going to next step. + END + IF "${exp_query_kind}" != "${EMPTY}" + ${source_query_kind}= Set Variable ${result["metric"]["kind"]} + ${checked}= Run Keyword And Return Status Should Be Equal As Strings ${source_query_kind} + ... ${exp_query_kind} + IF ${checked} == ${FALSE} + Continue For Loop + ELSE + Log message=Metrics query kind succesfully checked. Going to next step. + END + END + END + IF "${exp_pod_name}" != "${EMPTY}" + ${source_pod}= Set Variable ${result["metric"]["pod"]} + ${checked}= Run Keyword And Return Status Should Be Equal As Strings ${source_pod} + ... ${exp_pod_name} + IF ${checked} == ${FALSE} + Continue For Loop + ELSE + Log message=Metrics source pod succesfully checked. Going to next step. + END + END + IF "${exp_value}" != "${EMPTY}" + Run Keyword And Continue On Failure Should Be Equal As Strings ${curr_value} ${exp_value} + ELSE + Run Keyword And Continue On Failure Should Not Be Empty ${curr_value} + END + IF ${checked} == ${TRUE} + Log message=The desired query result has been found. + Exit For Loop + END + END + IF ${checked} == ${FALSE} + Fail msg=The metric you are looking for has not been found. 
Check the query parameter and try again + END