Skip to content

Commit

Permalink
Merge branch 'master' into fix_uninstall_rhoai_master
Browse files Browse the repository at this point in the history
  • Loading branch information
kobihk authored Jan 13, 2025
2 parents 6f0cf73 + 58b325d commit ca19a86
Show file tree
Hide file tree
Showing 26 changed files with 428 additions and 80 deletions.
31 changes: 27 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,32 @@ EOF
fi
}

#######################################
# Workaround for OCP versions older than 4.16.
# The AMD certified operator is published starting from OCP v4.16, so on older
# clusters a custom CatalogSource backed by the v4.16 certified operator index
# is created and the AMD install manifest is pointed at it.
# Globals:
#   GPU_INSTALL_DIR (read)               - directory holding amd_gpu_install.yaml
#   ocpVersion, ocpVersionSplit (written) - intentionally left global, exactly
#                                          as before, in case code outside this
#                                          function reads them
# Returns:
#   0 when no workaround is needed (or the version cannot be parsed),
#   otherwise the exit status of the final manifest rewrite.
#######################################
function applyWorkaroundForOlderOCPVersions () {
  local -r catalogName="certified-operators-416-amd"
  local major minor

  ocpVersion=$(oc version --output json | jq -r '.openshiftVersion')
  IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
  major="${ocpVersionSplit[0]:-}"
  minor="${ocpVersionSplit[1]:-}"

  # Without a numeric major/minor the comparison below is meaningless; skip
  # the workaround explicitly instead of relying on a failing `[ -lt ]`.
  if ! [[ "$major" =~ ^[0-9]+$ && "$minor" =~ ^[0-9]+$ ]]; then
    echo "WARNING: could not parse OCP version '$ocpVersion', skipping AMD catalog source workaround" >&2
    return 0
  fi

  # Compare major as well as minor so that e.g. 5.2 is not treated as "< 4.16".
  if [ "$major" -gt 4 ] || { [ "$major" -eq 4 ] && [ "$minor" -ge 16 ]; }; then
    return 0
  fi

  echo "OCP Version: $ocpVersion"
  echo "AMD Operator is not available for versions < 4.16, hence creating custom catalog source as workaround"
  oc apply -f - <<EOF
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: ${catalogName}
  namespace: openshift-marketplace
spec:
  displayName: Certfied operator
  image: 'registry.redhat.io/redhat/certified-operator-index:v4.16'
  publisher: RHOAI QE
  sourceType: grpc
EOF
  oc wait --timeout="120s" --for=condition=ready=true pod -n openshift-marketplace -l "olm.catalogSource=${catalogName}"
  # The optional "\(-416-amd\)*" group makes the rewrite idempotent: a manifest
  # that was already patched is matched as a whole and left unchanged instead
  # of growing a second "-416-amd" suffix.
  sed -i'' -e "s/certified-operators\(-416-amd\)*/${catalogName}/g" "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
}

applyWorkaroundForOlderOCPVersions
check_registry
status=$?

Expand All @@ -189,10 +215,7 @@ fi
sleep 120
wait_while 1800 ! machineconfig_updates

echo "Installing NFD operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
wait_while 360 ! has_csv_succeeded openshift-nfd nfd
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
echo "Installing KMM operator"
oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management
Expand Down
29 changes: 29 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Installs the Node Feature Discovery (NFD) operator and then applies the NFD
# instance manifest, after substituting the <imageUrl> placeholder with the
# operand image digest that matches the cluster's OCP x.y version.
#
# Requires: oc (logged in to the target cluster), jq, sed.
# Reads:    nfd_operator.yaml and nfd_deploy.yaml next to this script.
#
# pipefail makes a failing `oc version` abort the script instead of silently
# falling through to the default image.
set -eo pipefail

NFD_INSTALL_DIR="$(dirname "$0")"
NFD_INSTANCE="$NFD_INSTALL_DIR/nfd_deploy.yaml"
echo "Installing NFD operator"
oc apply -f "$NFD_INSTALL_DIR/nfd_operator.yaml"
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd

ocpVersion=$(oc version --output json | jq -r '.openshiftVersion')
IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
xyVersion="${ocpVersionSplit[0]}.${ocpVersionSplit[1]}"

# Operand image per OCP x.y version. URLs are stored verbatim (no sed escaping)
# so that they are logged exactly as they will appear in the manifest.
declare -A images=(
  ["4.14"]="registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:2977e67a413882efbfb90b52facf65d38a5cb2cd7a232ca3a69476e5dec33319"
  ["4.15"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:661b6697dee34626a3a98b50cdba787402ab214d2807b8460df92e3c79cdfcc5"
  ["4.16"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:bb95bc317ab78e8af4ef34dd66f9f62c2f8c261dfb5eab40918142812802f8b7"
  ["4.17"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:154cf3f1ddaf895d7ecd04947bd455a930132f72acc6e8bde8c26bc123184ace"
  # 4.18 is a pre-release image. We need to update it later
  ["4.18"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:510cb4351253492455664b6c323f54dc2f6f2f8791c5e92ba6b7e60b8adb357c"
)
if [[ -n "${images[$xyVersion]:-}" ]]; then
  imageUrl="${images[$xyVersion]}"
  echo "Using image SHA for $xyVersion: $imageUrl"
else
  imageUrl="${images["4.17"]}"
  echo "WARNING: I don't know the sha for $xyVersion. Re-using default 4.17 $imageUrl. It might not work!"
fi

# '|' is used as the sed delimiter because the image URL contains '/'.
# NOTE: the manifest is edited in place, so the <imageUrl> placeholder is
# consumed; running this again on an already-rendered file keeps the image
# written by the first run.
sed -i'' -e "s|<imageUrl>|$imageUrl|g" "$NFD_INSTANCE"
oc apply -f "$NFD_INSTANCE"
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ spec:
instance: "" # instance is empty by default
topologyupdater: false # False by default
operand:
# Image digest for registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11
image: registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:d6242132d2ddec00c46d22b63015a33af821eace0150ba47d185cd992fee317d
# Image URL example: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11
image: <imageUrl>
imagePullPolicy: Always
workerConfig:
configData: |
Expand Down
6 changes: 2 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla
sed -i'' -e "0,/v1.11/s//$CHANNEL/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"
/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified

Expand Down Expand Up @@ -88,7 +87,6 @@ function rerun_accelerator_migration() {
}

wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
Expand Down
61 changes: 60 additions & 1 deletion ods_ci/tests/Resources/CLI/ModelServing/llm.resource
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ ${SERVICEMESH_CR_NS}= istio-system
... vllm-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/vllm_servingruntime_{{protocol}}.yaml
... ovms-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/ovms_servingruntime_{{protocol}}.yaml
... caikit-standalone-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/caikit_standalone_servingruntime_{{protocol}}.yaml # robocop: disable
... triton-kserve-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/triton_servingruntime_{{protocol}}.yaml # robocop: disable
${DOWNLOAD_PVC_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/download_model_in_pvc.yaml
${DOWNLOAD_PVC_FILLED_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/download_model_in_pvc_filled.yaml

Expand Down Expand Up @@ -140,7 +141,7 @@ Compile Inference Service YAML
[Arguments] ${isvc_name} ${model_storage_uri} ${model_format}=caikit ${serving_runtime}=caikit-tgis-runtime
... ${kserve_mode}=${NONE} ${sa_name}=${DEFAULT_BUCKET_SA_NAME} ${canaryTrafficPercent}=${EMPTY} ${min_replicas}=1
... ${scaleTarget}=1 ${scaleMetric}=concurrency ${auto_scale}=${NONE}
... ${requests_dict}=&{EMPTY} ${limits_dict}=&{EMPTY} ${overlays}=${EMPTY}
... ${requests_dict}=&{EMPTY} ${limits_dict}=&{EMPTY} ${overlays}=${EMPTY} ${version}=${EMPTY}
IF '${auto_scale}' == '${NONE}'
${scaleTarget}= Set Variable ${EMPTY}
${scaleMetric}= Set Variable ${EMPTY}
Expand All @@ -153,6 +154,7 @@ Compile Inference Service YAML
Set Test Variable ${scaleMetric}
Set Test Variable ${canaryTrafficPercent}
Set Test Variable ${model_format}
Set Test Variable ${version}
Set Test Variable ${serving_runtime}
IF len($overlays) > 0
FOR ${index} ${overlay} IN ENUMERATE @{overlays}
Expand Down Expand Up @@ -414,6 +416,46 @@ Query Model Multiple Times
END
END

Setup Test Variables    # robocop: off=too-many-calls-in-keyword
    [Documentation]    Sets up variables for the Suite
    ...    Publishes as test variables: the model name/path, the test namespace,
    ...    the storage URI (PVC or S3 bucket), the GPU resource limits, the
    ...    port-forwarding flag and the S3 credentials/bucket settings.
    ...    The S3 values are set with logging turned off; afterwards the log level
    ...    that was active before is restored, even if one of those steps fails.
    [Arguments]    ${model_name}    ${kserve_mode}=Serverless    ${use_pvc}=${FALSE}    ${use_gpu}=${FALSE}
    ...    ${model_path}=${model_name}
    Set Test Variable    ${model_name}
    ${models_names}=    Create List    ${model_name}
    Set Test Variable    ${models_names}
    Set Test Variable    ${model_path}
    Set Test Variable    ${test_namespace}    ${TEST_NS}-${model_name}
    IF    ${use_pvc}
        Set Test Variable    ${storage_uri}    pvc://${model_name}-claim/${model_path}
    ELSE
        Set Test Variable    ${storage_uri}    s3://${S3.BUCKET_1.NAME}/${model_path}
    END
    IF    ${use_gpu}
        ${supported_gpu_type}=    Convert To Lowercase    ${GPU_TYPE}
        Set Runtime Image    ${supported_gpu_type}
        IF    "${supported_gpu_type}" == "nvidia"
            ${limits}=    Create Dictionary    nvidia.com/gpu=1
        ELSE IF    "${supported_gpu_type}" == "amd"
            ${limits}=    Create Dictionary    amd.com/gpu=1
        ELSE
            FAIL    msg=Provided GPU type is not yet supported. Only nvidia and amd gpu type are supported
        END
        Set Test Variable    ${limits}
    ELSE
        Set Test Variable    ${limits}    &{EMPTY}
    END
    # Variable names are case-insensitive in Robot Framework, so this is the
    # same ${kserve_mode} argument the original spelled in upper case.
    IF    "${kserve_mode}" == "RawDeployment"
        Set Test Variable    ${use_port_forwarding}    ${TRUE}
    ELSE
        Set Test Variable    ${use_port_forwarding}    ${FALSE}
    END
    # Keep credentials out of the log. Set Log Level returns the level that was
    # active before, which is restored in FINALLY instead of forcing INFO.
    ${previous_log_level}=    Set Log Level    NONE
    TRY
        Set Test Variable    ${access_key_id}    ${S3.AWS_ACCESS_KEY_ID}
        Set Test Variable    ${access_key}    ${S3.AWS_SECRET_ACCESS_KEY}
        Set Test Variable    ${endpoint}    ${MODELS_BUCKET.ENDPOINT}
        Set Test Variable    ${region}    ${MODELS_BUCKET.REGION}
    FINALLY
        Set Log Level    ${previous_log_level}
    END

Compile Deploy And Query LLM model
[Documentation] Group together the test steps for preparing, deploying
... and querying a model
Expand Down Expand Up @@ -909,3 +951,20 @@ Remove Model Mount Path From Runtime
${rc} ${out}= Run And Return Rc And Output
... oc patch servingruntime ${runtime} -n ${namespace} --type='json' -p='[{"op": "remove", "path": "/spec/containers/0/args/1"}]'
Should Be Equal As Integers ${rc} ${0} msg=${out}


Set Runtime Image
    [Documentation]    Sets up runtime variables for the Suite
    ...    When ${RUNTIME_IMAGE} is empty, a vLLM image matching ``gpu_type``
    ...    (nvidia or amd) is stored in the ${runtime_image} test variable and any
    ...    other GPU type fails the test. When ${RUNTIME_IMAGE} is not empty it is
    ...    left untouched.
    [Arguments]    ${gpu_type}
    IF    "${RUNTIME_IMAGE}" == "${EMPTY}"
        IF    "${gpu_type}" == "nvidia"
            Set Test Variable    ${runtime_image}    quay.io/modh/vllm@sha256:c86ff1e89c86bc9821b75d7f2bbc170b3c13e3ccf538bf543b1110f23e056316
        ELSE IF    "${gpu_type}" == "amd"
            Set Test Variable    ${runtime_image}    quay.io/modh/vllm@sha256:10f09eeca822ebe77e127aad7eca2571f859a5536a6023a1baffc6764bcadc6e
        ELSE
            FAIL    msg=Provided GPU type is not yet supported. Only nvidia and amd gpu type are supported
        END
    ELSE
        # Log To Console takes ``message`` (not ``msg``); passing ``msg=...``
        # printed the literal "msg=" prefix, so the text is given positionally.
        Log To Console    Using the image provided from terminal
    END

18 changes: 9 additions & 9 deletions ods_ci/tests/Resources/Common.robot
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ Get All Text Under Element
${elements}= Get WebElements ${parent_element}
${text_list}= Create List
FOR ${element} IN @{elements}
${text}= Run Keyword And Ignore Error
${status} ${text}= Run Keyword And Ignore Error
... Get Element Attribute ${element} textContent
Append To List ${text_list} ${text}
Run Keyword If '${status}' == 'PASS' Append To List ${text_list} ${text}
END
RETURN ${text_list}

Expand Down Expand Up @@ -579,7 +579,7 @@ Clone Git Repository
Get Operator Starting Version
[Documentation] Returns the starting version of the operator in the upgrade chain
${rc} ${out}= Run And Return RC And Output
... oc get subscription rhods-operator -n ${OPERATOR_NAMESPACE} -o yaml | yq '.spec.startingCSV' | awk -F. '{print $2"."$3"."$4}' # robocop: disable
... oc get subscription rhods-operator -n ${OPERATOR_NAMESPACE} -o yaml | yq -r '.spec.startingCSV' | awk -F. '{print $2"."$3"."$4}' # robocop: disable
Should Be Equal As Integers ${rc} 0
RETURN ${out}

Expand All @@ -601,14 +601,14 @@ Skip If Operator Starting Version Is Not Supported
Skip If condition="${supported}"=="${FALSE}" msg=This test is skipped because starting operator version < ${minimum_version}

Skip If Cluster Type Is Self-Managed
[Documentation] Skips test if cluster type is Self-managed
${cluster_type}= Is Cluster Type Self-Managed
Skip If condition=${cluster_type}==True msg=This test is skipped for Self-managed cluster
[Documentation] Skips test if cluster type is Self-managed
${cluster_type}= Is Cluster Type Managed
Skip If condition=${cluster_type}==False msg=This test is skipped for Self-managed cluster

Skip If Cluster Type Is Managed
[Documentation] Skips test if cluster type is Managed
${cluster_type}= Is Cluster Type Self-Managed
Skip If condition=${cluster_type}==False msg=This test is skipped for Managed cluster
[Documentation] Skips test if cluster type is Managed
${cluster_type}= Is Cluster Type Managed
Skip If condition=${cluster_type}==True msg=This test is skipped for Managed cluster

Delete All ${resource_type} In Namespace By Name
[Documentation] Force delete all ${resource_type} named '${resource_type}' in namespace '${namespace}'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ spec:
volumeMounts: []
modelFormat:
name: ${model_format}
version: ${version}
runtime: ${serving_runtime}
storageUri: ${model_storage_uri}
volumes: []
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# KServe ServingRuntime that runs the NVIDIA Triton server image
# (nvcr.io/nvidia/tritonserver) in a container named kserve-container.
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
name: triton-kserve-runtime
spec:
annotations:
# Prometheus scrape settings.
# NOTE(review): 8002 is presumably Triton's metrics port; no metrics flag is
# passed in args below - confirm.
prometheus.kserve.io/path: /metrics
prometheus.kserve.io/port: "8002"
containers:
- args:
- tritonserver
# Models are read from /mnt/models.
- --model-store=/mnt/models
# gRPC is served on 9000 and HTTP on 8080; both protocols are enabled.
- --grpc-port=9000
- --http-port=8080
- --allow-grpc=true
- --allow-http=true
image: nvcr.io/nvidia/tritonserver:23.05-py3
name: kserve-container
resources:
# Requests equal limits: 1 CPU and 2Gi of memory.
limits:
cpu: "1"
memory: 2Gi
requests:
cpu: "1"
memory: 2Gi
ports:
# Same port number as --http-port above.
- containerPort: 8080
protocol: TCP
protocolVersions:
- v2
- grpc-v2
# Model formats (name + version) this runtime declares support for.
supportedModelFormats:
- autoSelect: true
name: tensorrt
version: "8"
- autoSelect: true
name: tensorflow
version: "1"
- autoSelect: true
name: tensorflow
version: "2"
- autoSelect: true
name: onnx
version: "1"
# NOTE(review): pytorch is the only entry without autoSelect - confirm this is
# intentional.
- name: pytorch
version: "1"
- autoSelect: true
name: triton
version: "2"
- autoSelect: true
name: xgboost
version: "1"
- autoSelect: true
name: python
version: "1"
20 changes: 9 additions & 11 deletions ods_ci/tests/Resources/OCP.resource
Original file line number Diff line number Diff line change
Expand Up @@ -333,17 +333,15 @@ Wait For Namespace To Be Active
Log ${value}
Should Be Equal As Integers ${rc} 0

Is Cluster Type Self-Managed
[Documentation] Get the value of cluster type depending on the console URL domain
... Returns ${TRUE} if cluster type is Self-Managed
... Returns ${FALSE} if cluster type is Managed
${matches}= Get Regexp Matches ${OCP_CONSOLE_URL} rh-ods
${size}= Get Length ${matches}
IF ${size}>0
${domain}= Get From List ${matches} 0
IF "${domain}" == "rh-ods"
RETURN ${TRUE}
END
Is Cluster Type Managed
    [Documentation]    Determines the cluster type from the value of the "red-hat-managed"
    ...    resource tag read from the cluster infrastructure object
    ...    Returns ${TRUE} if cluster type is Managed
    ...    Returns ${FALSE} if cluster type is Self-Managed
    ${exit_code}    ${tag_value}=    Run And Return Rc And Output
    ...    oc get infrastructure cluster -o jsonpath='{.status.platformStatus.*.resourceTags[?(@.key=="red-hat-managed")].value}'
    Should Be Equal As Integers    ${exit_code}    0
    # Guard-clause form: only the literal output "true" means Managed.
    IF    "${tag_value}" == "true"    RETURN    ${TRUE}
    RETURN    ${FALSE}
Expand Down
17 changes: 11 additions & 6 deletions ods_ci/tests/Resources/Page/Components/Menu.robot
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,22 @@ Resource ../ODH/ODHDashboard/ODHDashboard.robot
Library String
Library JupyterLibrary


*** Variables ***
${SIDEBAR_XP} //div[@id="page-sidebar"]


*** Keywords ***
Navigate To Page
[Arguments]
... ${menu}
... ${submenu}=${NONE}
... ${timeout}=10s
Wait Until Element Is Visible //div[@id="page-sidebar"] timeout=${timeout}
Wait Until Element Is Visible ${SIDEBAR_XP} timeout=${timeout}
Wait Until Page Contains ${menu}
${menu}= Set Variable If "${menu}" == "Deployed models" Model Serving ${menu}
IF "${submenu}" == "${NONE}" Run Keyword And Return
... Click Link ${menu}
... Click Button ${SIDEBAR_XP}//button[text()="${menu}"]
${is_menu_expanded}= Menu.Is Menu Expanded ${menu}
IF "${is_menu_expanded}" == "false" Menu.Click Menu ${menu}
Wait Until Page Contains ${submenu}
Expand All @@ -23,20 +28,20 @@ Navigate To Page
Click Menu
[Arguments]
... ${menu}
Click Element //button[text()="${menu}"]
Click Element ${SIDEBAR_XP}//button[text()="${menu}"]

Click Submenu
[Arguments]
... ${submenu}
Click Element //a[text()="${submenu}"]
Click Element ${SIDEBAR_XP}//a[text()="${submenu}"]

Is Menu Expanded
[Arguments]
... ${menu}
${is_menu_expanded}= Get Element Attribute //button[text()="${menu}"] attribute=aria-expanded
${is_menu_expanded}= Get Element Attribute ${SIDEBAR_XP}//button[text()="${menu}"] attribute=aria-expanded
RETURN ${is_menu_expanded}

Page Should Contain Menu
[Arguments] ${menu}
Page Should Contain Element //button[text()="${menu}"]
Page Should Contain Element ${SIDEBAR_XP}//button[text()="${menu}"]

Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ Resource ../../Common.robot

*** Variables ***
${PROJECT_XP}= xpath=//div[text()='Project']
${DISTRIBUITED_WORKLOAD_METRICS_TITLE_XP}= xpath=//h1[text()="Distributed Workload Metrics"]
${DISTRIBUITED_WORKLOAD_METRICS_TEXT_XP}= xpath=//div[text()='Monitor the metrics of your active resources.']
${PROJECT_METRICS_TAB_XP}= xpath=//button[@aria-label="Project metrics tab"]
${WORKLOAD_STATUS_TAB_XP}= xpath=//button[@aria-label="Distributed workload status tab"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ Open Pipeline Run
Wait Until Page Contains Element xpath=//*[@data-testid="active-runs-tab"] timeout=30s
Click Element xpath=//*[@data-testid="active-runs-tab"]
Wait Until Page Contains Element xpath=//span[text()='${pipeline_run_name}']
Click Element xpath=//span[text()='${pipeline_run_name}']
Click Element xpath=//td[@data-label="Name"]//span[contains(text(), '${pipeline_run_name}')]
Wait Until Page Contains Element xpath=//div[@data-test-id='topology']

# robocop: disable:line-too-long
Expand Down
Loading

0 comments on commit ca19a86

Please sign in to comment.