From 5b6212c247b0b2beeaaff04cb17806e80669c357 Mon Sep 17 00:00:00 2001
From: Berto D'Attoma <88311595+bdattoma@users.noreply.github.com>
Date: Thu, 26 Oct 2023 15:50:50 +0200
Subject: [PATCH] Add WatsonX metrics test (#985)

* add initial test for model metrics

Signed-off-by: bdattoma <bdattoma@redhat.com>

* add checks for query type and cpu usage

Signed-off-by: bdattoma <bdattoma@redhat.com>

* remove commented code

Signed-off-by: bdattoma <bdattoma@redhat.com>

* add kw to protected list

Signed-off-by: bdattoma <bdattoma@redhat.com>

* add docs and minor fixes

Signed-off-by: bdattoma <bdattoma@redhat.com>

* add polarion id

Signed-off-by: bdattoma <bdattoma@redhat.com>

* fix some alerts

Signed-off-by: bdattoma <bdattoma@redhat.com>

* increase metric check stability

Signed-off-by: bdattoma <bdattoma@redhat.com>

* fix expected model response

Signed-off-by: bdattoma <bdattoma@redhat.com>

---------

Signed-off-by: bdattoma <bdattoma@redhat.com>
---
 .../Files/llm/model_expected_responses.json   |   2 +-
 .../Resources/Files/llm/uwm_cm_conf.yaml      |  10 +
 .../Resources/Files/llm/uwm_cm_enable.yaml    |   8 +
 .../Page/ODH/Monitoring/Monitoring.resource   |  25 +++
 ods_ci/tests/Resources/RHOSi.resource         |   1 +
 .../422__model_serving_llm.robot              | 180 +++++++++++++++++-
 6 files changed, 220 insertions(+), 6 deletions(-)
 create mode 100644 ods_ci/tests/Resources/Files/llm/uwm_cm_conf.yaml
 create mode 100644 ods_ci/tests/Resources/Files/llm/uwm_cm_enable.yaml

diff --git a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
index 22d37db7a..32a219318 100644
--- a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
+++ b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
@@ -6,7 +6,7 @@
                 "flan-t5-small-caikit": {
                     "generatedTokenCount":  5,
                     "response_text": "74 degrees F",
-                    "streamed_response_text":   "{'details':{}}{'tokens':[{'text':'▁','logprob':-1.6961849927902222}],'details':{'generated_tokens':1}}{'generated_text':'74','tokens':[{'text':'74','logprob':-3.2507317066192627}],'details':{'generated_tokens':2}}{'generated_text':'degrees','tokens':[{'text':'▁degrees','logprob':-0.4324553906917572}],'details':{'generated_tokens':3}}{'generated_text':'F','tokens':[{'text':'▁F','logprob':-1.3610913753509521}],'details':{'generated_tokens':4}}{'tokens':[{'text':'\u003c/s\u003e','logprob':-0.010431881994009018}],'details':{'finish_reason':'EOS_TOKEN','generated_tokens':5}}"
+                    "streamed_response_text":   "{'details':{'input_token_count':'8'}}{'tokens':[{'text':'▁','logprob':-1.6961838006973267}],'details':{'generated_tokens':1}}{'generated_text':'74','tokens':[{'text':'74','logprob':-3.250730037689209}],'details':{'generated_tokens':2}}{'generated_text':'degrees','tokens':[{'text':'▁degrees','logprob':-0.4324559271335602}],'details':{'generated_tokens':3}}{'generated_text':'F','tokens':[{'text':'▁F','logprob':-1.361091136932373}],'details':{'generated_tokens':4}}{'tokens':[{'text':'\u003c/s\u003e','logprob':-0.010431881994009018}],'details':{'finish_reason':'EOS_TOKEN','generated_tokens':5}}"
                 },
                 "bloom-560m-caikit": {
                     "generatedTokenCount":  20,
diff --git a/ods_ci/tests/Resources/Files/llm/uwm_cm_conf.yaml b/ods_ci/tests/Resources/Files/llm/uwm_cm_conf.yaml
new file mode 100644
index 000000000..c3a15e86a
--- /dev/null
+++ b/ods_ci/tests/Resources/Files/llm/uwm_cm_conf.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: user-workload-monitoring-config
+  namespace: openshift-user-workload-monitoring
+data:
+  config.yaml: |
+    prometheus:
+      logLevel: debug 
+      retention: 15d #Change as needed
\ No newline at end of file
diff --git a/ods_ci/tests/Resources/Files/llm/uwm_cm_enable.yaml b/ods_ci/tests/Resources/Files/llm/uwm_cm_enable.yaml
new file mode 100644
index 000000000..710f4777a
--- /dev/null
+++ b/ods_ci/tests/Resources/Files/llm/uwm_cm_enable.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cluster-monitoring-config
+  namespace: openshift-monitoring
+data:
+  config.yaml: |
+    enableUserWorkload: true
\ No newline at end of file
diff --git a/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource b/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource
index 8df0ca984..e456bea60 100644
--- a/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource
+++ b/ods_ci/tests/Resources/Page/ODH/Monitoring/Monitoring.resource
@@ -135,3 +135,28 @@ Suite Availability Teardown
     Run Keyword And Warn On Failure    Alerts Should Not Be Firing
     ...    pm_url=${pm_url}    pm_token=${pm_token}    expected-firing-alert=DeadManSnitch
     ...    message_prefix=Suite Trdwn: ${SUITE NAME}:
+
+Get OpenShift Thanos URL
+    [Documentation]    Returns the host of the `thanos-querier` Route found in the `openshift-monitoring` namespace
+    ${route}=    Oc Get    kind=Route    name=thanos-querier    namespace=openshift-monitoring
+    ...    fields=['status.ingress[0].host']
+    RETURN    ${route}[0][status.ingress[0].host]
+
+Generate Thanos Token
+    [Documentation]    Returns the output of `oc whoami -t`, i.e. the user token used to access thanos-querier.
+    ${rc}    ${token}=    Run And Return Rc And Output    oc whoami -t
+    Should Be Equal As Integers    ${rc}    ${0}
+    RETURN    ${token}
+
+Get Thanos Metrics List
+    [Documentation]    Returns the metric names exposed by thanos-querier (one per line when ${search_text} is
+    ...                given, the whole jq `.data` output otherwise). Double quotes are stripped from the output
+    [Arguments]    ${thanos_url}    ${thanos_token}    ${search_text}=${EMPTY}
+    ${curl_cmd}=    Set Variable    curl -k -H "Authorization: Bearer ${thanos_token}" https://${thanos_url}/api/v1/label/__name__/values
+    IF    "${search_text}" == "${EMPTY}"
+        ${filter}=    Set Variable    jq '.data'
+    ELSE
+        ${filter}=    Set Variable    jq '.data[]' | grep ${search_text}
+    END
+    ${rc}    ${names}=    Run And Return Rc And Output    ${curl_cmd} | ${filter} | tr -d '"'
+    RETURN    ${names}
diff --git a/ods_ci/tests/Resources/RHOSi.resource b/ods_ci/tests/Resources/RHOSi.resource
index f73f069e2..9e9774135 100644
--- a/ods_ci/tests/Resources/RHOSi.resource
+++ b/ods_ci/tests/Resources/RHOSi.resource
@@ -42,6 +42,7 @@ Resource            Common.robot
 ...                         Fill Data Connection Form
 ...                         Create Secret For S3-Like Buckets
 ...                         Login To OCP Using API
+...                         Generate Thanos Token
 
 
 *** Keywords ***
diff --git a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/422__model_serving_llm.robot b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/422__model_serving_llm.robot
index c63136f9e..6504cd593 100644
--- a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/422__model_serving_llm.robot
+++ b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/422__model_serving_llm.robot
@@ -5,7 +5,7 @@ Resource          ../../../Resources/OCP.resource
 Resource          ../../../Resources/Page/Operators/ISVs.resource
 Library            OpenShiftLibrary
 Suite Setup       Install Model Serving Stack Dependencies
-# Suite Teardown
+Suite Teardown    RHOSi Teardown
 
 
 *** Variables ***
@@ -38,6 +38,8 @@ ${INFERENCESERVICE_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/caikit_isvc.yaml
 ${DEFAULT_BUCKET_SECRET_NAME}=    models-bucket-secret
 ${DEFAULT_BUCKET_SA_NAME}=        models-bucket-sa
 ${EXP_RESPONSES_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/model_expected_responses.json
+${UWM_ENABLE_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/uwm_cm_enable.yaml
+${UWM_CONFIG_FILEPATH}=    ${LLM_RESOURCES_DIRPATH}/uwm_cm_conf.yaml
 ${SKIP_PREREQS_INSTALL}=    ${FALSE}
 ${SCRIPT_BASED_INSTALL}=    ${FALSE}
 ${MODELS_BUCKET}=    ${S3.BUCKET_3}
@@ -464,13 +466,55 @@ Verify Runtime Upgrade Does Not Affect Deployed Models
     [Teardown]    Clean Up Test Project    test_ns=${test_namespace}
     ...    isvc_names=${models_names}
 
+Verify User Can Access Model Metrics From UWM
+    [Documentation]    Verifies that model metrics are available for users in the
+    ...                OpenShift monitoring system (UserWorkloadMonitoring).
+    ...                PARTIALLY DONE: it checks the number of requests (single and stream), the number
+    ...                of successful requests and the model pod CPU usage. Waiting for a complete list of
+    ...                expected metrics and derived metrics.
+    [Tags]    ODS-2401    WatsonX
+    [Setup]    Set Project And Runtime    namespace=watsonx-metrics    enable_metrics=${TRUE}
+    ${test_namespace}=    Set Variable     watsonx-metrics
+    ${flan_model_name}=    Set Variable    flan-t5-small-caikit
+    ${models_names}=    Create List    ${flan_model_name}
+    ${thanos_url}=    Get OpenShift Thanos URL
+    ${token}=    Generate Thanos Token
+    Compile Inference Service YAML    isvc_name=${flan_model_name}
+    ...    sa_name=${DEFAULT_BUCKET_SA_NAME}
+    ...    model_storage_uri=${FLAN_STORAGE_URI}
+    Deploy Model Via CLI    isvc_filepath=${LLM_RESOURCES_DIRPATH}/caikit_isvc_filled.yaml
+    ...    namespace=${test_namespace}
+    Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${flan_model_name}
+    ...    namespace=${test_namespace}
+    TGI Caikit And Istio Metrics Should Exist    thanos_url=${thanos_url}    thanos_token=${token}
+    Query Models And Check Responses Multiple Times    models_names=${models_names}
+    ...    endpoint=${CAIKIT_ALLTOKENS_ENDPOINT}    n_times=3
+    ...    namespace=${test_namespace}
+    Wait Until Keyword Succeeds    50 times    5s
+    ...    User Can Fetch Number Of Requests Over Defined Time    thanos_url=${thanos_url}    thanos_token=${token}
+    ...    model_name=${flan_model_name}    query_kind=single    namespace=${test_namespace}    period=5m    exp_value=3
+    Wait Until Keyword Succeeds    20 times    5s
+    ...    User Can Fetch Number Of Successful Requests Over Defined Time    thanos_url=${thanos_url}    thanos_token=${token}
+    ...    model_name=${flan_model_name}    namespace=${test_namespace}    period=5m    exp_value=3
+    Wait Until Keyword Succeeds    20 times    5s
+    ...    User Can Fetch CPU Utilization    thanos_url=${thanos_url}    thanos_token=${token}
+    ...    model_name=${flan_model_name}    namespace=${test_namespace}    period=5m
+    Query Models And Check Responses Multiple Times    models_names=${models_names}
+    ...    endpoint=${CAIKIT_STREAM_ENDPOINT}    n_times=1    streamed_response=${TRUE}
+    ...    namespace=${test_namespace}    query_idx=${0}
+    Wait Until Keyword Succeeds    30 times    5s
+    ...    User Can Fetch Number Of Requests Over Defined Time    thanos_url=${thanos_url}    thanos_token=${token}
+    ...    model_name=${flan_model_name}    query_kind=stream    namespace=${test_namespace}    period=5m    exp_value=1
+    [Teardown]    Clean Up Test Project    test_ns=${test_namespace}
+    ...    isvc_names=${models_names}
+
 
 *** Keywords ***
 Install Model Serving Stack Dependencies
     [Documentation]    Instaling And Configuring dependency operators: Service Mesh and Serverless.
     ...                This is likely going to change in the future and it will include a way to skip installation.
     ...                Caikit runtime will be shipped Out-of-the-box and will be removed from here.
-    # RHOSi Setup
+    RHOSi Setup
     IF    ${SKIP_PREREQS_INSTALL} == ${FALSE}
         IF    ${SCRIPT_BASED_INSTALL} == ${FALSE}
             Install Service Mesh Stack
@@ -706,12 +750,20 @@ Deploy Caikit Serving Runtime
     ...    oc apply -f ${CAIKIT_FILEPATH} -n ${namespace}
 
 Set Project And Runtime
-    [Arguments]    ${namespace}
+    [Documentation]    Creates the DS Project (if not exists), the data connection for the models and the caikit
+    ...                runtime. If ${enable_metrics} is TRUE, it also applies the UWM ConfigMaps. Usable as test setup
+    [Arguments]    ${namespace}    ${enable_metrics}=${FALSE}
     Set Up Test OpenShift Project    test_ns=${namespace}
     Create Secret For S3-Like Buckets    endpoint=${MODELS_BUCKET.ENDPOINT}
     ...    region=${MODELS_BUCKET.REGION}    namespace=${namespace}
     # temporary step - caikit will be shipped OOTB
     Deploy Caikit Serving Runtime    namespace=${namespace}
+    IF   ${enable_metrics} == ${TRUE}
+        Oc Apply    kind=ConfigMap    src=${UWM_ENABLE_FILEPATH}
+        Oc Apply    kind=ConfigMap    src=${UWM_CONFIG_FILEPATH}
+    ELSE
+        Log    message=Skipping UserWorkloadMonitoring enablement.
+    END
 
 Create Secret For S3-Like Buckets
     [Documentation]    Configures the cluster to fetch models from a S3-like bucket
@@ -830,6 +882,7 @@ Query Models And Check Responses Multiple Times
             ...    endpoint=${endpoint}
             ...    json_body=${body}    json_header=${header}
             ...    insecure=${TRUE}    skip_res_json=${streamed_response}
+            Log    ${res}
             Run Keyword And Continue On Failure
             ...    Model Response Should Match The Expectation    model_response=${res}    model_name=${model_name}
             ...    streamed_response=${streamed_response}    query_idx=${query_idx}
@@ -870,9 +923,8 @@ Run Install Script
         ${rc}=    Run And Watch Command    TARGET_OPERATOR=${SCRIPT_TARGET_OPERATOR} BREW_TAG=${SCRIPT_BREW_TAG} CHECK_UWM=false ./scripts/install/kserve-install.sh
         ...    cwd=caikit-tgis-serving/demo/kserve
     ELSE
-        ${rc}=    Run And Watch Command    TARGET_OPERATOR=${SCRIPT_TARGET_OPERATOR} CHECK_UWM=false ./scripts/install/kserve-install.sh
+        ${rc}=    Run And Watch Command    DEPLOY_ODH_OPERATOR=false TARGET_OPERATOR=${SCRIPT_TARGET_OPERATOR} CHECK_UWM=false ./scripts/install/kserve-install.sh
         ...    cwd=caikit-tgis-serving/demo/kserve
-
     END
     Should Be Equal As Integers    ${rc}    ${0}
 
@@ -894,3 +946,121 @@ Get Model Pods Creation Date And Image URL
     ...    oc get pod --selector serving.kserve.io/inferenceservice=${model_name} -n ${namespace} -ojson | jq '.items[].spec.containers[].image' | grep caikit-tgis    # robocop: disable
     Should Be Equal As Integers    ${rc}    ${0}
     RETURN    ${created_at}    ${caikitsha}
+
+User Can Fetch Number Of Requests Over Defined Time
+    [Documentation]    Queries the `tgi_request_count` metric over ${period} and looks for the result reporting
+    ...                the expected model information (namespace, model and type of request).
+    ...                When ${exp_value} is given, the metric value is checked as well
+    [Arguments]    ${thanos_url}    ${thanos_token}    ${model_name}    ${namespace}
+    ...    ${query_kind}=single    ${period}=30m    ${exp_value}=${EMPTY}
+    ${query_resp}=    Prometheus.Run Query    https://${thanos_url}    ${thanos_token}    tgi_request_count[${period}]
+    Log    ${query_resp.json()["data"]}
+    Check Query Response Values    response=${query_resp}    exp_namespace=${namespace}    exp_value=${exp_value}
+    ...    exp_model_name=${model_name}    exp_query_kind=${query_kind}
+
+User Can Fetch Number Of Successful Requests Over Defined Time
+    [Documentation]    Queries the `tgi_request_success` metric over ${period} and looks for the result reporting
+    ...                the expected model information (namespace, model and type of request).
+    ...                When ${exp_value} is given, the metric value is checked as well
+    [Arguments]    ${thanos_url}    ${thanos_token}    ${model_name}    ${namespace}
+    ...    ${query_kind}=single    ${period}=30m    ${exp_value}=${EMPTY}
+    ${query_resp}=    Prometheus.Run Query    https://${thanos_url}    ${thanos_token}    tgi_request_success[${period}]
+    Log    ${query_resp.json()["data"]}
+    Check Query Response Values    response=${query_resp}    exp_namespace=${namespace}    exp_value=${exp_value}
+    ...    exp_model_name=${model_name}    exp_query_kind=${query_kind}
+
+User Can Fetch CPU Utilization
+    [Documentation]    Queries the `pod:container_cpu_usage:sum` metric for ${namespace} over ${period} and looks
+    ...                for the result reporting the model pod (first pod matching the inferenceservice label).
+    ...                When ${exp_value} is given, the metric value is checked as well
+    [Arguments]    ${thanos_url}    ${thanos_token}    ${namespace}    ${model_name}    ${period}=30m    ${exp_value}=${EMPTY}
+    ${query_resp}=    Prometheus.Run Query    https://${thanos_url}    ${thanos_token}    pod:container_cpu_usage:sum{namespace="${namespace}"}[${period}]
+    ${model_pods}=    Oc Get    kind=Pod    namespace=${namespace}
+    ...    label_selector=serving.kserve.io/inferenceservice=${model_name}
+    ...    fields=['metadata.name']
+    Log    ${query_resp.json()["data"]}
+    Check Query Response Values    response=${query_resp}    exp_namespace=${namespace}
+    ...    exp_pod_name=${model_pods}[0][metadata.name]    exp_value=${exp_value}
+
+TGI Caikit And Istio Metrics Should Exist
+    [Documentation]    Fetches the `tgi`, `caikit` and `istio` metric names (only the `tgi` list is asserted
+    ...                to be non-empty). Returns a single list with all the metric names found
+    [Arguments]    ${thanos_url}    ${thanos_token}
+    ${tgi_metrics_names}=    Get Thanos Metrics List    thanos_url=${thanos_url}    thanos_token=${thanos_token}
+    ...    search_text=tgi
+    Should Not Be Empty    ${tgi_metrics_names}
+    ${tgi_metrics_names}=    Split To Lines    ${tgi_metrics_names}
+    ${caikit_metrics_names}=    Get Thanos Metrics List    thanos_url=${thanos_url}    thanos_token=${thanos_token}
+    ...    search_text=caikit
+    ${caikit_metrics_names}=    Split To Lines    ${caikit_metrics_names}
+    ${istio_metrics_names}=    Get Thanos Metrics List    thanos_url=${thanos_url}    thanos_token=${thanos_token}
+    ...    search_text=istio
+    ${istio_metrics_names}=    Split To Lines    ${istio_metrics_names}
+    ${metrics}=    Combine Lists    ${tgi_metrics_names}    ${caikit_metrics_names}    ${istio_metrics_names}
+    RETURN    ${metrics}
+
+Check Query Response Values    # robocop:disable
+    [Documentation]    Implements the metric checks for `User Can Fetch Number Of Requests Over Defined Time`
+    ...                `User Can Fetch Number Of Successful Requests Over Defined Time` and `User Can Fetch CPU Utilization`.
+    ...                It searches among the available metric values for the specific model
+    ...                and fails if no result matches, including when the query returned no results at all
+    [Arguments]    ${response}    ${exp_namespace}    ${exp_model_name}=${EMPTY}    ${exp_query_kind}=${EMPTY}    ${exp_value}=${EMPTY}    ${exp_pod_name}=${EMPTY}
+    ${json_results}=    Set Variable    ${response.json()["data"]["result"]}
+    ${checked}=    Set Variable    ${FALSE}
+    FOR    ${index}    ${result}    IN ENUMERATE    @{json_results}
+        Log    ${index}: ${result}
+        ${value_keyname}=    Run Keyword And Return Status
+        ...    Dictionary Should Contain Key    ${result}    value
+        IF    ${value_keyname} == ${TRUE}
+            ${curr_value}=    Set Variable    ${result["value"][-1]}
+        ELSE
+            ${curr_value}=    Set Variable    ${result["values"][-1][-1]}
+        END
+        ${source_namespace}=    Set Variable    ${result["metric"]["namespace"]}
+        ${checked}=    Run Keyword And Return Status    Should Be Equal As Strings    ${source_namespace}    ${exp_namespace}
+        IF    ${checked} == ${FALSE}
+            Continue For Loop
+        ELSE
+            Log    message=Metrics source namespaced succesfully checked. Going to next step.
+        END
+        IF    "${exp_model_name}" != "${EMPTY}"
+            ${source_model}=    Set Variable    ${result["metric"]["job"]}
+            ${checked}=    Run Keyword And Return Status    Should Be Equal As Strings    ${source_model}
+            ...    ${exp_model_name}-metrics
+            IF    ${checked} == ${FALSE}
+                Continue For Loop
+            ELSE
+                Log    message=Metrics source model succesfully checked. Going to next step.
+            END
+            IF    "${exp_query_kind}" != "${EMPTY}"
+                ${source_query_kind}=    Set Variable    ${result["metric"]["kind"]}
+                ${checked}=    Run Keyword And Return Status    Should Be Equal As Strings    ${source_query_kind}
+                ...    ${exp_query_kind}
+                IF    ${checked} == ${FALSE}
+                    Continue For Loop
+                ELSE
+                    Log    message=Metrics query kind succesfully checked. Going to next step.
+                END
+            END
+        END
+        IF    "${exp_pod_name}" != "${EMPTY}"
+            ${source_pod}=    Set Variable    ${result["metric"]["pod"]}
+            ${checked}=    Run Keyword And Return Status    Should Be Equal As Strings    ${source_pod}
+            ...    ${exp_pod_name}
+            IF    ${checked} == ${FALSE}
+                Continue For Loop
+            ELSE
+                Log    message=Metrics source pod succesfully checked. Going to next step.
+            END
+        END
+        IF    "${exp_value}" != "${EMPTY}"
+            Run Keyword And Continue On Failure    Should Be Equal As Strings    ${curr_value}    ${exp_value}
+        ELSE
+            Run Keyword And Continue On Failure    Should Not Be Empty    ${curr_value}
+        END
+        Log    message=The desired query result has been found.
+        Exit For Loop
+    END
+    IF    ${checked} == ${FALSE}
+        Fail    msg=The metric you are looking for has not been found. Check the query parameter and try again
+    END