From 6b89749e758cabc223edfdf5b34aeaa361d453a0 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Sat, 7 Oct 2023 17:55:51 +0200 Subject: [PATCH 01/19] fix minimal-cuda-test --- .../ODH/JupyterHub/JupyterHubSpawner.robot | 53 +++++++++++++------ .../500__jupyterhub/autoscaling-gpus.robot | 3 +- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot b/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot index f2e95480f..f1876bfd4 100644 --- a/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot +++ b/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot @@ -18,7 +18,13 @@ ${KFNBC_SPAWNER_HEADER_XPATH} = //h1[.="Start a notebook server"] ${JUPYTERHUB_DROPDOWN_XPATH} = //button[@aria-label="Options menu"] ${KFNBC_CONTAINER_SIZE_TITLE} = //div[.="Deployment size"]/..//span[.="Container Size"] ${KFNBC_CONTAINER_SIZE_DROPDOWN_XPATH} = //label[@for="modal-notebook-container-size"]/../..//button[@aria-label="Options menu"] -${KFNBC_GPU_DROPDOWN_XPATH} = //button[contains(@aria-labelledby, "gpu-numbers")] +${KFNBC_ACCELERATOR_HEADER_XPATH} = //span[text()='Accelerator'] +${KFNBC_ACCELERATOR_DROPDOWN_XPATH} = //label[@for='modal-notebook-accelerator']/ancestor::div[@class='pf-c-form__group']/descendant::button +${KFNBC_ACCELERATOR_DROPDOWN_NVIDIA_XPATH} = //div[@class and text()='Nvidia GPU'] +${KFNBC_ACCELERATOR_INPUT_XPATH} = //input[@aria-label='Number of accelerators'] +${KFNBC_ACCELERATOR_LESS_BUTTON_XPATH} = ${KFNBC_ACCELERATOR_INPUT_XPATH}/preceding-sibling::button +${KFNBC_ACCELERATOR_PLUS_BUTTON_XPATH} = ${KFNBC_ACCELERATOR_INPUT_XPATH}/following-sibling::button +${KFNBC_MAX_ACCELERATOR_WARNING_XPATH} = //div[@aria-label='Warning Alert']//h4[contains(text(), 'accelerator detected')] ${KFNBC_MODAL_HEADER_XPATH} = //div[@aria-label="Starting server modal"] ${KFNBC_MODAL_CANCEL_XPATH} = ${KFNBC_MODAL_HEADER_XPATH}//button[.="Cancel"] ${KFNBC_MODAL_CLOSE_XPATH} = ${KFNBC_MODAL_HEADER_XPATH}//button[.="Close"] @@ -92,26 +98,40 @@ Select Container Size Click Element xpath:${JUPYTERHUB_DROPDOWN_XPATH}\[1] Click Element xpath://span[.="${container_size}"]/../.. -Wait Until GPU Dropdown Exists - [Documentation] Verifies that the dropdown to select the no. of GPUs exists - Wait Until Page Contains Number of GPUs +Wait until Accelerator Dropdown Exists + [Documentation] Verifies that the dropdown to select the Accelerator exists Page Should Not Contain All GPUs are currently in use, try again later. - Wait Until Page Contains Element xpath:${KFNBC_GPU_DROPDOWN_XPATH} - ... error=GPU selector is not present in JupyterHub Spawner + Wait Until Page Contains Element xpath:${KFNBC_ACCELERATOR_DROPDOWN_XPATH} + ... error=Accelerator selector is not present in JupyterHub Spawner -Set Number Of Required GPUs - [Documentation] Sets the gpu count based on the ${gpus} argument +Set NVidia GPU Accelerator + [Documentation] Set NVidia GPU Accelerator + Click Element xpath:${KFNBC_ACCELERATOR_DROPDOWN_XPATH} + Click Element xpath:${KFNBC_ACCELERATOR_DROPDOWN_NVIDIA_XPATH} + +Set Number Of Required Accelerators + [Documentation] Sets the Accelerators count based on the ${gpus} argument [Arguments] ${gpus} - Click Element xpath:${KFNBC_GPU_DROPDOWN_XPATH} - Click Element xpath:${KFNBC_GPU_DROPDOWN_XPATH}/../..//button[.="${gpus}"] + ${acc_num}= Get Value xpath:${KFNBC_ACCELERATOR_INPUT_XPATH} + Log Actual num of Accelerators: ${acc_num} + IF ${acc_num} != ${gpus} + Input Text ${KFNBC_ACCELERATOR_INPUT_XPATH} ${gpus} + END + Fetch Max Number Of GPUs In Spawner Page [Documentation] Returns the maximum number of GPUs a user can request from the spawner - ${gpu_visible} = Run Keyword And Return Status Wait Until GPU Dropdown Exists + ${gpu_visible} = Run Keyword And Return Status Wait until Accelerator Dropdown Exists IF ${gpu_visible}==True - Click Element xpath:${KFNBC_GPU_DROPDOWN_XPATH} - ${maxGPUs} = Get Text xpath://li[@class="pf-c-select__menu-wrapper"][last()]/button - ${maxGPUs} = Convert To Integer ${maxGPUs} + Set NVidia GPU Accelerator + ${max_operator_detected}= Run Keyword And Return Status Page Should Contain Element xpath=${KFNBC_MAX_ACCELERATOR_WARNING_XPATH} + WHILE ${max_operator_detected} == ${FALSE} + Click Element xpath:${KFNBC_ACCELERATOR_PLUS_BUTTON_XPATH} + ${max_operator_detected}= Run Keyword And Return Status Page Should Contain Element xpath=${KFNBC_MAX_ACCELERATOR_WARNING_XPATH} + ${maxGPUs} = Get Value xpath:${KFNBC_ACCELERATOR_INPUT_XPATH} + ${maxGPUs} = Convert To Integer ${maxGPUs} + ${maxGPUs} = Set Variable ${maxGPUs-1} + END ELSE ${maxGPUs} = Set Variable ${0} END @@ -262,9 +282,10 @@ Spawn Notebook With Arguments # robocop: disable IF ${spawner_ready}==True Select Notebook Image ${image} ${version} Select Container Size ${size} - ${gpu_visible} = Run Keyword And Return Status Wait Until GPU Dropdown Exists + ${gpu_visible} = Run Keyword And Return Status Wait until Accelerator Dropdown Exists IF ${gpu_visible}==True and ${gpus}>0 - Set Number Of Required GPUs ${gpus} + Set NVidia GPU Accelerator + Set Number Of Required Accelerators ${gpus} ELSE IF ${gpu_visible}==False and ${gpus}>0 IF ${index} < ${retries} Sleep 30s reason=Wait for GPU to free up diff --git a/ods_ci/tests/Tests/500__jupyterhub/autoscaling-gpus.robot b/ods_ci/tests/Tests/500__jupyterhub/autoscaling-gpus.robot index bc3d62268..4839d4538 100644 --- a/ods_ci/tests/Tests/500__jupyterhub/autoscaling-gpus.robot +++ b/ods_ci/tests/Tests/500__jupyterhub/autoscaling-gpus.robot @@ -58,7 +58,8 @@ Spawn Notebook And Trigger Autoscale ... of the GPU node. Select Notebook Image ${NOTEBOOK_IMAGE} Select Container Size Small - Set Number Of Required GPUs 1 + Set NVidia GPU Accelerator + Set Number Of Required Accelerators 1 Spawn Notebook spawner_timeout=20 minutes expect_autoscaling=${True} Run Keyword And Warn On Failure Wait Until Page Contains Log in with OpenShift timeout=15s ${oauth_prompt_visible} = Is OpenShift OAuth Login Prompt Visible From 37e240d80f6a7d07b08ce174b7b0807a669354ef Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Tue, 17 Oct 2023 14:13:09 +0200 Subject: [PATCH 02/19] fix workbenches --- .../Workbenches.resource | 54 +++++++++++++------ ...5__ods_dashboard_projects_additional.robot | 2 - 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource index 4b5aa68b1..b5714e9b4 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource @@ -12,9 +12,14 @@ ${WORKBENCH_CREATE_BTN_XP}= xpath=//button[text()="Create workbench"] ${WORKBENCH_CREATE_BTN_2_XP}= xpath=//button[@id="create-button"] ${WORKBENCH_NAME_INPUT_XP}= xpath=//input[@name="workbench-name"] ${WORKBENCH_DESCR_TXT_XP}= xpath=//textarea[@name="workbench-description"] -${WORKBENCH_IMAGE_MENU_BTN_XP}= xpath=//section[@id="notebook-image"]//button[@aria-label="Options menu"] +${WORKBENCH_IMAGE_MENU_BTN_XP}= xpath=//div[@id="workbench-image-stream-selection"]/button ${WORKBENCH_IMAGE_ITEM_BTN_XP}= xpath=//ul[@id="workbench-image-stream-selection"]/li/button -${WORKBENCH_IMAGE_ITEM_SPAN_XP}= xpath=//ul[@id="workbench-image-stream-selection"]/li//span +${WORKBENCH_IMAGE_ITEM_SPAN_XP}= xpath=//div[@id="workbench-image-stream-selection"]//li//div +${WORKBENCH_SIZE_SIDE_MENU_BTN}= xpath=//nav[@aria-label="Jump to section"]//span[text()="Deployment size"] +${WORKBENCH_ACCELERATOR_DROPDOWN_XPATH}= xpath=//label[@for='modal-notebook-accelerator']/ancestor::div[@class='pf-c-form__group']/descendant::button +${WORKBENCH_ACCELERATOR_INPUT_XPATH}= xpath=//input[@aria-label='Number of accelerators'] +${WORKBENCH_ACCELERATOR_LESS_BUTTON_XPATH}= xpath=${WORKBENCH_INPUT_XPATH}/preceding-sibling::button +${WORKBENCH_ACCELERATOR_PLUS_BUTTON_XPATH}= xpath=${WORKBENCH_ACCELERATOR_INPUT_XPATH}/following-sibling::button ${WORKBENCH_SIZE_MENU_BTN_XP}= xpath=//section[@id="deployment-size"]//button[@aria-label="Options menu"] ${WORKBENCH_SIZE_ITEM_BTN_XP}= xpath=//ul[@data-id="container-size-select"]/li/button ${WORKBENCH_GPU_MENU_BTN_XP}= xpath=//section[@id="deployment-size"]//button[contains(@aria-labelledby,"gpu-numbers")] # robocop: disable @@ -181,8 +186,8 @@ Select Workbench Jupyter Image Wait Until Page Contains Element ${WORKBENCH_IMAGE_ITEM_SPAN_XP}\[text()="TrustyAI"]/.. timeout=10s Click Element ${WORKBENCH_IMAGE_ITEM_SPAN_XP}\[text()="TrustyAI"]/.. ELSE - Wait Until Page Contains Element ${WORKBENCH_IMAGE_ITEM_BTN_XP}\[text()="${image_name}"] timeout=10s - Click Element ${WORKBENCH_IMAGE_ITEM_BTN_XP}\[text()="${image_name}"] + Wait Until Page Contains Element ${WORKBENCH_IMAGE_ITEM_SPAN_XP}\[text()="${image_name}"] timeout=10s + Click Element ${WORKBENCH_IMAGE_ITEM_SPAN_XP}\[text()="${image_name}"] IF "${version}"=="default" Verify Version Selection Dropdown ELSE IF "${version}"=="previous" @@ -221,8 +226,10 @@ Select Workbench Image Version Select Workbench Container Size [Documentation] Selects the container size in the workbench creation page [Arguments] ${size_name}=Small + Wait Until Page Contains Element ${WORKBENCH_SIZE_SIDE_MENU_BTN} + Click Element ${WORKBENCH_SIZE_SIDE_MENU_BTN} Wait Until Page Contains Element ${WORKBENCH_SIZE_MENU_BTN_XP} - Click Button ${WORKBENCH_SIZE_MENU_BTN_XP} + Click Element ${WORKBENCH_SIZE_MENU_BTN_XP} Wait Until Page Contains Element ${WORKBENCH_SIZE_ITEM_BTN_XP}/span[text()="${size_name}"] Click Element ${WORKBENCH_SIZE_ITEM_BTN_XP}/span[text()="${size_name}"] @@ -451,11 +458,31 @@ Page Should Contain Event Log Select Workbench Number Of GPUs [Documentation] Selects the container size in the workbench creation page - [Arguments] ${gpus} - Wait Until Page Contains Element ${WORKBENCH_GPU_MENU_BTN_XP} - Click Button ${WORKBENCH_GPU_MENU_BTN_XP} - Wait Until Page Contains Element ${WORKBENCH_GPU_ITEM_BTN_XP}/self::*[text()="${gpus}"] - Click Element ${WORKBENCH_GPU_ITEM_BTN_XP}/self::*[text()="${gpus}"] + [Arguments] ${gpus} ${gpu_type}='Nvidia GPU' + Wait Until Page Contains Element ${WORKBENCH_SIZE_SIDE_MENU_BTN} + Click Element ${WORKBENCH_SIZE_SIDE_MENU_BTN} + Wait Until Page Contains Element ${WORKBENCH_ACCELERATOR_DROPDOWN_XPATH} + Click Element ${WORKBENCH_ACCELERATOR_DROPDOWN_XPATH} + IF "${gpus}" == "0" + Click Element xpath=//a[text()='None'] + ELSE + # Select Accelerator Technology + Wait Until Page Contains Element xpath=//div[@class and text()=${gpu_type}] + Click Element xpath=//div[@class and text()=${gpu_type}] + # Select number of GPU units + ${actual_gpus} = Get Value ${WORKBENCH_ACCELERATOR_INPUT_XPATH} + ${actual_gpus} = Convert To Integer ${actual_gpus} + ${gpus} = Convert To Integer ${gpus} + WHILE ${actual_gpus} != ${gpus} + IF ${actual_gpus} < ${gpus} + Click Element ${WORKBENCH_ACCELERATOR_PLUS_BUTTON_XPATH} + ELSE + Click Element ${WORKBENCH_ACCELERATOR_LESS_BUTTON_XPATH} + END + ${actual_gpus} = Get Value ${WORKBENCH_ACCELERATOR_INPUT_XPATH} + ${actual_gpus} = Convert To Integer ${actual_gpus} + END + END Edit GPU Number [Documentation] Edit a workbench @@ -478,13 +505,6 @@ Delete Workbench From CLI ... workbench_title=${workbench_title} namespace=${ns_name} Oc Delete kind=Notebook name=${cr_name} namespace=${ns_name} -GPU Dropdown Should Be Disabled - [Documentation] Checks if the GPU dropdown is not able editable - [Arguments] ${workbench_title} - Click Action From Actions Menu item_title=${workbench_title} item_type=workbench action=Edit - Wait Until Page Contains Element ${WORKBENCH_GPU_MENU_BTN_XP} - Element Should Be Disabled ${WORKBENCH_GPU_MENU_BTN_XP} - Get Workbench Pod [Documentation] Retrieves info of a workbench pod: namespace, CR resource name and pod definition [Arguments] ${workbench_title} ${project_title} diff --git a/ods_ci/tests/Tests/400__ods_dashboard/415__ods_dashboard_projects/415__ods_dashboard_projects_additional.robot b/ods_ci/tests/Tests/400__ods_dashboard/415__ods_dashboard_projects/415__ods_dashboard_projects_additional.robot index c228e89c2..275407f23 100644 --- a/ods_ci/tests/Tests/400__ods_dashboard/415__ods_dashboard_projects/415__ods_dashboard_projects_additional.robot +++ b/ods_ci/tests/Tests/400__ods_dashboard/415__ods_dashboard_projects/415__ods_dashboard_projects_additional.robot @@ -116,8 +116,6 @@ Verify User Can Remove GPUs From Workbench ... pv_description=${EMPTY} pv_size=${PV_SIZE} gpus=1 Run Keyword And Continue On Failure Wait Until Workbench Is Started workbench_title=${WORKBENCH_TITLE_GPU} Sleep 10s reason=There is some delay in updating the GPU availability in Dashboard - Run Keyword And Continue On Failure GPU Dropdown Should Be Disabled workbench_title=${WORKBENCH_TITLE_GPU} - Click Button ${GENERIC_CANCEL_BTN_XP} Stop Workbench workbench_title=${WORKBENCH_TITLE_GPU} Run Keyword And Continue On Failure Wait Until Workbench Is Stopped workbench_title=${WORKBENCH_TITLE_GPU} Wait Until Keyword Succeeds 10 times 5s From b29b2cb827b23924cb79b561ad6728a2a5345e1c Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Tue, 17 Oct 2023 18:00:11 +0200 Subject: [PATCH 03/19] fix model serving --- .../ModelServer.resource | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource index adace4607..641412a92 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource @@ -18,11 +18,11 @@ ${S3_BUCKET_DC_INPUT_XP}= xpath=//input[@aria-label="AWS field AWS_S3_BUCKET ${REPLICAS_PLUS_BTN_XP}= xpath=//div/button[@aria-label="Plus"] ${REPLICAS_MIN_BTN_XP}= xpath=//div/button[@aria-label="Minus"] ${SERVING_RUNTIME_NAME}= xpath=//input[@id="serving-runtime-name-input"] -${GPU_SECTION_TITLE}= xpath=//span[.="Model server GPUs"] -${GPU_SECTION_INPUT}= ${GPU_SECTION_TITLE}/../../..//input -${GPU_SECTION_PLUS}= ${GPU_SECTION_TITLE}/../../..//button[@aria-label="Plus"] -${GPU_SECTION_MINUS}= ${GPU_SECTION_TITLE}/../../..//button[@aria-label="Minus"] - +${SERVING_ACCELERATOR_DROPDOWN_XPATH}= xpath=//label[@for='modal-notebook-accelerator']/ancestor::div[@class='pf-c-form__group']/descendant::button +${SERVING_ACCELERATOR_INPUT_XPATH}= xpath=//input[@aria-label='Number of accelerators'] +${SERVING_ACCELERATOR_LESS_BUTTON_XPATH}= xpath=${SERVING_INPUT_XPATH}/preceding-sibling::button +${SERVING_ACCELERATOR_PLUS_BUTTON_XPATH}= xpath=${SERVING_ACCELERATOR_INPUT_XPATH}/following-sibling::button +${SERVING_MODEL_SERVERS_SIDE_MENU}= xpath=//span[text()='Models and model servers'] *** Keywords *** Create Model Server @@ -42,6 +42,7 @@ Create Model Server Log GPU requested but not available Fail END + Set Accelerator Set Number of GPU With Buttons ${no_gpus} END IF ${ext_route}==${TRUE} @@ -88,13 +89,18 @@ Set Server Size Verify GPU Selector Is Usable [Documentation] Verifies that the GPU selector is present and enabled - Page Should Contain Element ${GPU_SECTION_TITLE} - Element Should Be Enabled ${GPU_SECTION_INPUT} + Page Should Contain Element ${SERVING_ACCELERATOR_DROPDOWN_XPATH} + +Set Accelerator + [Documentation] Set NVidia GPU Accelerator + [Arguments] ${accelerator}='Nvidia GPU' + Click Element ${SERVING_ACCELERATOR_DROPDOWN_XPATH} + Click Element xpath=//div[@class and text()=${accelerator}] Set Number of GPU With Buttons [Documentation] Select the number of GPUs to attach to the model server [Arguments] ${no_gpus} - ${current}= Get Element Attribute ${GPU_SECTION_INPUT} value + ${current}= Get Element Attribute ${SERVING_ACCELERATOR_INPUT_XPATH} value ${difference}= Evaluate int(${no_gpus})-int(${current}) ${op}= Set Variable plus IF ${difference}<${0} @@ -108,16 +114,16 @@ Set Number of GPU With Buttons Click GPU Minus Button END END - ${current}= Get Element Attribute ${GPU_SECTION_INPUT} value + ${current}= Get Element Attribute ${SERVING_ACCELERATOR_INPUT_XPATH} value Should Be Equal As Integers ${current} ${no_gpus} Click GPU Plus Button [Documentation] Click the plus button in the GPU selector - Click Element ${GPU_SECTION_PLUS} + Click Element ${SERVING_ACCELERATOR_PLUS_BUTTON_XPATH} Click GPU Minus Button [Documentation] Click the minus button in the GPU selector - Click Element ${GPU_SECTION_MINUS} + Click Element ${SERVING_ACCELERATOR_LESS_BUTTON_XPATH} Verify Displayed GPU Count [Documentation] Verifies the number of GPUs displayed in the Model Server table @@ -128,8 +134,10 @@ Verify Displayed GPU Count IF ${expanded}==False Click Element xpath://button[@aria-expanded="false"]/span[.="${server_name}"] END - Page Should Contain Element xpath://span[.="${server_name}"]/../../../..//span[.="Number of GPUs"] - Page Should Contain Element xpath://span[.="${server_name}"]/../../../..//span[.="Number of GPUs"]/../../dd/div[.="${no_gpus}"] + Click Element ${SERVING_MODEL_SERVERS_SIDE_MENU} + Sleep 5s reason=wait for ten second until operator goes into init state + ${current_accs} = Get Text xpath://span[text()="${server_name}"]/../../../following-sibling::tr//td[@data-label]/div/dl/div[4]/dd/div + Should Match ${current_accs} ${no_gpus} Set Model Server Runtime [Documentation] Selects a given Runtime for the model server From 7339697280974997ca44b3402e24157789527fa6 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Tue, 17 Oct 2023 22:39:57 +0200 Subject: [PATCH 04/19] Linter fixes --- .../Page/ODH/JupyterHub/JupyterHubSpawner.robot | 14 +++++++------- .../ODHDataScienceProject/ModelServer.resource | 2 +- .../ODHDataScienceProject/Workbenches.resource | 11 +++++------ 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot b/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot index f1876bfd4..a5de08f42 100644 --- a/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot +++ b/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot @@ -98,7 +98,7 @@ Select Container Size Click Element xpath:${JUPYTERHUB_DROPDOWN_XPATH}\[1] Click Element xpath://span[.="${container_size}"]/../.. -Wait until Accelerator Dropdown Exists +Wait Until Accelerator Dropdown Exists [Documentation] Verifies that the dropdown to select the Accelerator exists Page Should Not Contain All GPUs are currently in use, try again later. Wait Until Page Contains Element xpath:${KFNBC_ACCELERATOR_DROPDOWN_XPATH} @@ -112,7 +112,7 @@ Set NVidia GPU Accelerator Set Number Of Required Accelerators [Documentation] Sets the Accelerators count based on the ${gpus} argument [Arguments] ${gpus} - ${acc_num}= Get Value xpath:${KFNBC_ACCELERATOR_INPUT_XPATH} + ${acc_num} = Get Value xpath:${KFNBC_ACCELERATOR_INPUT_XPATH} Log Actual num of Accelerators: ${acc_num} IF ${acc_num} != ${gpus} Input Text ${KFNBC_ACCELERATOR_INPUT_XPATH} ${gpus} @@ -121,13 +121,13 @@ Set Number Of Required Accelerators Fetch Max Number Of GPUs In Spawner Page [Documentation] Returns the maximum number of GPUs a user can request from the spawner - ${gpu_visible} = Run Keyword And Return Status Wait until Accelerator Dropdown Exists + ${gpu_visible} = Run Keyword And Return Status Wait Until Accelerator Dropdown Exists IF ${gpu_visible}==True Set NVidia GPU Accelerator - ${max_operator_detected}= Run Keyword And Return Status Page Should Contain Element xpath=${KFNBC_MAX_ACCELERATOR_WARNING_XPATH} - WHILE ${max_operator_detected} == ${FALSE} + ${max_operator_detected} = Run Keyword And Return Status Page Should Contain Element xpath=${KFNBC_MAX_ACCELERATOR_WARNING_XPATH} + WHILE not ${max_operator_detected} Click Element xpath:${KFNBC_ACCELERATOR_PLUS_BUTTON_XPATH} - ${max_operator_detected}= Run Keyword And Return Status Page Should Contain Element xpath=${KFNBC_MAX_ACCELERATOR_WARNING_XPATH} + ${max_operator_detected} = Run Keyword And Return Status Page Should Contain Element xpath=${KFNBC_MAX_ACCELERATOR_WARNING_XPATH} ${maxGPUs} = Get Value xpath:${KFNBC_ACCELERATOR_INPUT_XPATH} ${maxGPUs} = Convert To Integer ${maxGPUs} ${maxGPUs} = Set Variable ${maxGPUs-1} @@ -282,7 +282,7 @@ Spawn Notebook With Arguments # robocop: disable IF ${spawner_ready}==True Select Notebook Image ${image} ${version} Select Container Size ${size} - ${gpu_visible} = Run Keyword And Return Status Wait until Accelerator Dropdown Exists + ${gpu_visible} = Run Keyword And Return Status Wait Until Accelerator Dropdown Exists IF ${gpu_visible}==True and ${gpus}>0 Set NVidia GPU Accelerator Set Number Of Required Accelerators ${gpus} diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource index 641412a92..851a70879 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource @@ -136,7 +136,7 @@ Verify Displayed GPU Count END Click Element ${SERVING_MODEL_SERVERS_SIDE_MENU} Sleep 5s reason=wait for ten second until operator goes into init state - ${current_accs} = Get Text xpath://span[text()="${server_name}"]/../../../following-sibling::tr//td[@data-label]/div/dl/div[4]/dd/div + ${current_accs}= Get Text xpath://span[text()="${server_name}"]/../../../following-sibling::tr//td[@data-label]/div/dl/div[4]/dd/div Should Match ${current_accs} ${no_gpus} Set Model Server Runtime diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource index 963052840..aacc7903c 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource @@ -14,7 +14,6 @@ ${WORKBENCH_NAME_INPUT_XP}= xpath=//input[@name="workbench-name"] ${WORKBENCH_DESCR_TXT_XP}= xpath=//textarea[@name="workbench-description"] ${WORKBENCH_IMAGE_MENU_BTN_XP}= xpath=//section[@id="notebook-image"]//div[@id="workbench-image-stream-selection"]/button # robocop: disable ${WORKBENCH_IMAGE_ITEM_BTN_XP}= xpath=//div[@id="workbench-image-stream-selection"]//li//div -# ${WORKBENCH_IMAGE_ITEM_SPAN_XP}= xpath=//ul[@id="workbench-image-stream-selection"]/li//span ${WORKBENCH_SIZE_MENU_BTN_XP}= xpath=//section[@id="deployment-size"]//button # Removing the attribute in case it changes like it did for the image dropdown ${WORKBENCH_SIZE_SIDE_MENU_BTN}= xpath=//nav[@aria-label="Jump to section"]//span[text()="Deployment size"] ${WORKBENCH_ACCELERATOR_DROPDOWN_XPATH}= xpath=//label[@for='modal-notebook-accelerator']/ancestor::div[@class='pf-c-form__group']/descendant::button @@ -472,17 +471,17 @@ Select Workbench Number Of GPUs Wait Until Page Contains Element xpath=//div[@class and text()=${gpu_type}] Click Element xpath=//div[@class and text()=${gpu_type}] # Select number of GPU units - ${actual_gpus} = Get Value ${WORKBENCH_ACCELERATOR_INPUT_XPATH} - ${actual_gpus} = Convert To Integer ${actual_gpus} - ${gpus} = Convert To Integer ${gpus} + ${actual_gpus}= Get Value ${WORKBENCH_ACCELERATOR_INPUT_XPATH} + ${actual_gpus}= Convert To Integer ${actual_gpus} + ${gpus}= Convert To Integer ${gpus} WHILE ${actual_gpus} != ${gpus} IF ${actual_gpus} < ${gpus} Click Element ${WORKBENCH_ACCELERATOR_PLUS_BUTTON_XPATH} ELSE Click Element ${WORKBENCH_ACCELERATOR_LESS_BUTTON_XPATH} END - ${actual_gpus} = Get Value ${WORKBENCH_ACCELERATOR_INPUT_XPATH} - ${actual_gpus} = Convert To Integer ${actual_gpus} + ${actual_gpus}= Get Value ${WORKBENCH_ACCELERATOR_INPUT_XPATH} + ${actual_gpus}= Convert To Integer ${actual_gpus} END END From cbf6eec202c7ac338bdaeea757e0597f81b7df61 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Wed, 18 Oct 2023 13:27:26 +0200 Subject: [PATCH 05/19] PR Fixes --- .../ODH/JupyterHub/JupyterHubSpawner.robot | 20 +++++++++---------- .../100__installation/102__post_install.robot | 15 ++------------ 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot b/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot index a5de08f42..6925a6bfe 100644 --- a/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot +++ b/ods_ci/tests/Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot @@ -20,7 +20,6 @@ ${KFNBC_CONTAINER_SIZE_TITLE} = //div[.="Deployment size"]/..//span[.="Contai ${KFNBC_CONTAINER_SIZE_DROPDOWN_XPATH} = //label[@for="modal-notebook-container-size"]/../..//button[@aria-label="Options menu"] ${KFNBC_ACCELERATOR_HEADER_XPATH} = //span[text()='Accelerator'] ${KFNBC_ACCELERATOR_DROPDOWN_XPATH} = //label[@for='modal-notebook-accelerator']/ancestor::div[@class='pf-c-form__group']/descendant::button -${KFNBC_ACCELERATOR_DROPDOWN_NVIDIA_XPATH} = //div[@class and text()='Nvidia GPU'] ${KFNBC_ACCELERATOR_INPUT_XPATH} = //input[@aria-label='Number of accelerators'] ${KFNBC_ACCELERATOR_LESS_BUTTON_XPATH} = ${KFNBC_ACCELERATOR_INPUT_XPATH}/preceding-sibling::button ${KFNBC_ACCELERATOR_PLUS_BUTTON_XPATH} = ${KFNBC_ACCELERATOR_INPUT_XPATH}/following-sibling::button @@ -104,18 +103,19 @@ Wait Until Accelerator Dropdown Exists Wait Until Page Contains Element xpath:${KFNBC_ACCELERATOR_DROPDOWN_XPATH} ... error=Accelerator selector is not present in JupyterHub Spawner -Set NVidia GPU Accelerator - [Documentation] Set NVidia GPU Accelerator +Set GPU Accelerator + [Documentation] Set Accelerator type + [Arguments] ${accelerator_type}='Nvidia GPU' Click Element xpath:${KFNBC_ACCELERATOR_DROPDOWN_XPATH} - Click Element xpath:${KFNBC_ACCELERATOR_DROPDOWN_NVIDIA_XPATH} + Click Element xpath://div[@class and text()=${accelerator_type}] Set Number Of Required Accelerators - [Documentation] Sets the Accelerators count based on the ${gpus} argument - [Arguments] ${gpus} + [Documentation] Sets the Accelerators count based on the ${accelerators} argument + [Arguments] ${accelerators} ${acc_num} = Get Value xpath:${KFNBC_ACCELERATOR_INPUT_XPATH} Log Actual num of Accelerators: ${acc_num} - IF ${acc_num} != ${gpus} - Input Text ${KFNBC_ACCELERATOR_INPUT_XPATH} ${gpus} + IF ${acc_num} != ${accelerators} + Input Text ${KFNBC_ACCELERATOR_INPUT_XPATH} ${accelerators} END @@ -123,7 +123,7 @@ Fetch Max Number Of GPUs In Spawner Page [Documentation] Returns the maximum number of GPUs a user can request from the spawner ${gpu_visible} = Run Keyword And Return Status Wait Until Accelerator Dropdown Exists IF ${gpu_visible}==True - Set NVidia GPU Accelerator + Set GPU Accelerator ${max_operator_detected} = Run Keyword And Return Status Page Should Contain Element xpath=${KFNBC_MAX_ACCELERATOR_WARNING_XPATH} WHILE not ${max_operator_detected} Click Element xpath:${KFNBC_ACCELERATOR_PLUS_BUTTON_XPATH} @@ -284,7 +284,7 @@ Spawn Notebook With Arguments # robocop: disable Select Container Size ${size} ${gpu_visible} = Run Keyword And Return Status Wait Until Accelerator Dropdown Exists IF ${gpu_visible}==True and ${gpus}>0 - Set NVidia GPU Accelerator + Set GPU Accelerator Set Number Of Required Accelerators ${gpus} ELSE IF ${gpu_visible}==False and ${gpus}>0 IF ${index} < ${retries} diff --git a/ods_ci/tests/Tests/100__deploy/100__installation/102__post_install.robot b/ods_ci/tests/Tests/100__deploy/100__installation/102__post_install.robot index 59993164c..4877577a4 100644 --- a/ods_ci/tests/Tests/100__deploy/100__installation/102__post_install.robot +++ b/ods_ci/tests/Tests/100__deploy/100__installation/102__post_install.robot @@ -59,20 +59,10 @@ Verify GPU Operator Deployment # robocop: disable # Before GPU Node is added to the cluster # NS - Verify Namespace Status label=kubernetes.io/metadata.name=redhat-nvidia-gpu-addon - # Node-Feature-Discovery Operator - Verify Operator Status label=operators.coreos.com/ose-nfd.redhat-nvidia-gpu-addon - ... operator_name=ose-nfd.* + Verify Namespace Status label=kubernetes.io/metadata.name=nvidia-gpu-operator # GPU Operator - Verify Operator Status label=operators.coreos.com/gpu-operator-certified.redhat-nvidia-gpu-addon + Verify Operator Status label=operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator ... operator_name=gpu-operator-certified.v* - # nfd-controller-manager - Verify Deployment Status label=operators.coreos.com/ose-nfd.redhat-nvidia-gpu-addon - ... dname=nfd-controller-manager - # nfd-master - Verify DaemonSet Status label=app=nfd-master dsname=nfd-master - # nfd-worker - Verify DaemonSet Status label=app=nfd-worker dsname=nfd-worker # After GPU Node is added to the cluster Verify DaemonSet Status label=app=gpu-feature-discovery dsname=gpu-feature-discovery @@ -84,7 +74,6 @@ Verify GPU Operator Deployment # robocop: disable # Verify DaemonSet Status label=app=nvidia-driver-daemonset-* dsname=nvidia-driver-daemonset-* Verify DaemonSet Status label=app=nvidia-node-status-exporter dsname=nvidia-node-status-exporter Verify DaemonSet Status label=app=nvidia-operator-validator dsname=nvidia-operator-validator - Verify CR Status crd=NodeFeatureDiscovery cr_name=ocp-gpu-addon Verify That Prometheus Image Is A CPaaS Built Image [Documentation] Verifies the images used for prometheus From 58f70868872c278508328aff89deadb11a0cc419 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Wed, 18 Oct 2023 15:42:00 +0200 Subject: [PATCH 06/19] PR fixes --- .../ODHDashboard/ODHDataScienceProject/ModelServer.resource | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource index 851a70879..8daa1cbbf 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource @@ -20,7 +20,7 @@ ${REPLICAS_MIN_BTN_XP}= xpath=//div/button[@aria-label="Minus"] ${SERVING_RUNTIME_NAME}= xpath=//input[@id="serving-runtime-name-input"] ${SERVING_ACCELERATOR_DROPDOWN_XPATH}= xpath=//label[@for='modal-notebook-accelerator']/ancestor::div[@class='pf-c-form__group']/descendant::button ${SERVING_ACCELERATOR_INPUT_XPATH}= xpath=//input[@aria-label='Number of accelerators'] -${SERVING_ACCELERATOR_LESS_BUTTON_XPATH}= xpath=${SERVING_INPUT_XPATH}/preceding-sibling::button +${SERVING_ACCELERATOR_MINUS_BUTTON_XPATH}= xpath=${SERVING_INPUT_XPATH}/preceding-sibling::button ${SERVING_ACCELERATOR_PLUS_BUTTON_XPATH}= xpath=${SERVING_ACCELERATOR_INPUT_XPATH}/following-sibling::button ${SERVING_MODEL_SERVERS_SIDE_MENU}= xpath=//span[text()='Models and model servers'] @@ -123,7 +123,7 @@ Click GPU Plus Button Click GPU Minus Button [Documentation] Click the minus button in the GPU selector - Click Element ${SERVING_ACCELERATOR_LESS_BUTTON_XPATH} + Click Element ${SERVING_ACCELERATOR_MINUS_BUTTON_XPATH} Verify Displayed GPU Count [Documentation] Verifies the number of GPUs displayed in the Model Server table @@ -135,7 +135,7 @@ Verify Displayed GPU Count Click Element xpath://button[@aria-expanded="false"]/span[.="${server_name}"] END Click Element ${SERVING_MODEL_SERVERS_SIDE_MENU} - Sleep 5s reason=wait for ten second until operator goes into init state + Sleep 5s reason=Sometimes the number of current Accelerators take a few seconds to update ${current_accs}= Get Text xpath://span[text()="${server_name}"]/../../../following-sibling::tr//td[@data-label]/div/dl/div[4]/dd/div Should Match ${current_accs} ${no_gpus} From 7e13d55eb6a7ea956d90c1cd44cb20b1cf47fbc1 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Thu, 19 Oct 2023 16:22:48 +0200 Subject: [PATCH 07/19] fix PR comment: Generic accelerator setter --- .../ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource index 8daa1cbbf..a22c54dc5 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource @@ -92,7 +92,7 @@ Verify GPU Selector Is Usable Page Should Contain Element ${SERVING_ACCELERATOR_DROPDOWN_XPATH} Set Accelerator - [Documentation] Set NVidia GPU Accelerator + [Documentation] Set GPU Accelerator [Arguments] ${accelerator}='Nvidia GPU' Click Element ${SERVING_ACCELERATOR_DROPDOWN_XPATH} Click Element xpath=//div[@class and text()=${accelerator}] From 734a405e328865847f91ab2e0d312bd50d5d7396 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Fri, 20 Oct 2023 17:20:23 +0200 Subject: [PATCH 08/19] Add a rerun migration for accelerators in gpu deploy script --- .../Resources/Provisioning/GPU/gpu_deploy.sh | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index e8f56de12..de08739c7 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -40,9 +40,54 @@ function wait_until_gpu_pods_are_running() { } +function rerun_accelerator_migration() { +#As we are adding the GPUs after installing the operator, those GPUs are not discovered automatically. +#In order to rerun the migration we need to +#1. Delete the migration configmap +#2. Delete the dashboard replicaset to trigger new pods +#Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938 + + local timeout_seconds=600 + local sleep_time=5 + + echo "Deleting configmap migration-gpu-status" + oc delete configmap migration-gpu-status -n redhat-ods-applications + if [[ $? -ne 0 ]] + then + printf "ERROR: When trying to delete the migration-gpu-status configmap\n" + return 1 + fi + + dashboard_rs=$(oc get rs -n redhat-ods-applications | grep rhods-dashboard- | awk '{print $1;exit}') + echo "Deleting ReplicaSet $dashboard_rs" + oc delete rs $dashboard_rs -n redhat-ods-applications + if [[ $? -ne 0 ]] + then + printf "ERROR: When trying to delete the dashboard replica set\n" + return 1 + fi + + # Wait until all dashboard pods are ready again + SECONDS=0 + while [ "$SECONDS" -le "$timeout_seconds" ]; do + dashboard_pods=$(oc get deployment rhods-dashboard -n redhat-ods-applications | grep rhods-dashboard | awk '{print $2;exit}') + dashboard_pods_total=`echo $dashboard_pods | cut -c3-3` + dashboard_pods_avail=`echo $dashboard_pods | cut -c1-1` + ((remaining_seconds = timeout_seconds - SECONDS)) + echo "Dashboard pods: Available $dashboard_pods_avail out of $dashboard_pods_total ... (timeout in $remaining_seconds seconds)" + if [ $dashboard_pods_avail == $dashboard_pods_total ]; then + break + else + sleep $sleep_time + ((SECONDS+=$sleep_time)) + fi + done +} + wait_until_gpu_pods_are_running oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json oc apply -f clusterpolicy.json +rerun_accelerator_migration From c3ad5217ce633af2634b817241b7a206ac0d0d1a Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Fri, 20 Oct 2023 17:26:11 +0200 Subject: [PATCH 09/19] PR fixes --- ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index de08739c7..19c38d483 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -51,8 +51,7 @@ function rerun_accelerator_migration() { local sleep_time=5 echo "Deleting configmap migration-gpu-status" - oc delete configmap migration-gpu-status -n redhat-ods-applications - if [[ $? -ne 0 ]] + if ! oc delete configmap migration-gpu-status -n redhat-ods-applications; then printf "ERROR: When trying to delete the migration-gpu-status configmap\n" return 1 @@ -60,8 +59,7 @@ function rerun_accelerator_migration() { dashboard_rs=$(oc get rs -n redhat-ods-applications | grep rhods-dashboard- | awk '{print $1;exit}') echo "Deleting ReplicaSet $dashboard_rs" - oc delete rs $dashboard_rs -n redhat-ods-applications - if [[ $? -ne 0 ]] + if ! oc delete rs $dashboard_rs -n redhat-ods-applications; then printf "ERROR: When trying to delete the dashboard replica set\n" return 1 @@ -71,8 +69,8 @@ function rerun_accelerator_migration() { SECONDS=0 while [ "$SECONDS" -le "$timeout_seconds" ]; do dashboard_pods=$(oc get deployment rhods-dashboard -n redhat-ods-applications | grep rhods-dashboard | awk '{print $2;exit}') - dashboard_pods_total=`echo $dashboard_pods | cut -c3-3` - dashboard_pods_avail=`echo $dashboard_pods | cut -c1-1` + dashboard_pods_total=$(echo $dashboard_pods | cut -c3-3) + dashboard_pods_avail=$(echo $dashboard_pods | cut -c1-1) ((remaining_seconds = timeout_seconds - SECONDS)) echo "Dashboard pods: Available $dashboard_pods_avail out of $dashboard_pods_total ... (timeout in $remaining_seconds seconds)" if [ $dashboard_pods_avail == $dashboard_pods_total ]; then From a355851508ed88ac6e9981638a994ac76c119732 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Mon, 23 Oct 2023 17:18:53 +0200 Subject: [PATCH 10/19] Update ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index 19c38d483..a4df6c26d 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -73,7 +73,7 @@ function rerun_accelerator_migration() { dashboard_pods_avail=$(echo $dashboard_pods | cut -c1-1) ((remaining_seconds = timeout_seconds - SECONDS)) echo "Dashboard pods: Available $dashboard_pods_avail out of $dashboard_pods_total ... (timeout in $remaining_seconds seconds)" - if [ $dashboard_pods_avail == $dashboard_pods_total ]; then + if [ "$dashboard_pods_avail" == "$dashboard_pods_total" ]; then break else sleep $sleep_time From a01fe7157d3856781804e3ef898f8c3a68533284 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Mon, 23 Oct 2023 17:19:12 +0200 Subject: [PATCH 11/19] Update ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index a4df6c26d..3322efbbe 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -69,8 +69,8 @@ function rerun_accelerator_migration() { SECONDS=0 while [ "$SECONDS" -le "$timeout_seconds" ]; do dashboard_pods=$(oc get deployment rhods-dashboard -n redhat-ods-applications | grep rhods-dashboard | awk '{print $2;exit}') - dashboard_pods_total=$(echo $dashboard_pods | cut -c3-3) - dashboard_pods_avail=$(echo $dashboard_pods | cut -c1-1) + dashboard_pods_total=$(echo "$dashboard_pods" | cut -c3-3) + dashboard_pods_avail=$(echo "$dashboard_pods" | cut -c1-1) ((remaining_seconds = timeout_seconds - SECONDS)) echo "Dashboard pods: Available $dashboard_pods_avail out of $dashboard_pods_total ... (timeout in $remaining_seconds seconds)" if [ "$dashboard_pods_avail" == "$dashboard_pods_total" ]; then From 043a1989d57f0982e6a3f1f7d27e8ea30274692a Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Mon, 23 Oct 2023 17:19:58 +0200 Subject: [PATCH 12/19] Update ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index 3322efbbe..4d71d39d3 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -59,7 +59,7 @@ function rerun_accelerator_migration() { dashboard_rs=$(oc get rs -n redhat-ods-applications | grep rhods-dashboard- | awk '{print $1;exit}') echo "Deleting ReplicaSet $dashboard_rs" - if ! oc delete rs $dashboard_rs -n redhat-ods-applications; + if ! oc delete rs "$dashboard_rs" -n redhat-ods-applications; then printf "ERROR: When trying to delete the dashboard replica set\n" return 1 From 097807a9418875e752a70eeb7734183961fb9d70 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Mon, 23 Oct 2023 17:21:55 +0200 Subject: [PATCH 13/19] PR fixes --- .../ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource index a22c54dc5..5d6242d8b 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource @@ -92,7 +92,7 @@ Verify GPU Selector Is Usable Page Should Contain Element ${SERVING_ACCELERATOR_DROPDOWN_XPATH} Set Accelerator - [Documentation] Set GPU Accelerator + [Documentation] Set Accelerator [Arguments] ${accelerator}='Nvidia GPU' Click Element ${SERVING_ACCELERATOR_DROPDOWN_XPATH} Click Element xpath=//div[@class and text()=${accelerator}] From 1c7c2807cc825d35eb7c592912d100e7339b626f Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Wed, 25 Oct 2023 12:16:57 +0200 Subject: [PATCH 14/19] delete one pod instead the complete dashboard replica set --- .../Resources/Provisioning/GPU/gpu_deploy.sh | 33 +++++-------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index 4d71d39d3..1751882ce 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -41,11 +41,11 @@ function wait_until_gpu_pods_are_running() { } function rerun_accelerator_migration() { -#As we are adding the GPUs after installing the operator, those GPUs are not discovered automatically. -#In order to rerun the migration we need to -#1. Delete the migration configmap -#2. Delete the dashboard replicaset to trigger new pods -#Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938 +# As we are adding the GPUs after installing the RHODS operator, those GPUs are not discovered automatically. +# In order to rerun the migration we need to +# 1. Delete the migration configmap +# 2. Delete one of the dashboard pods, so the configmap is created again and the migration run again +# Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938 local timeout_seconds=600 local sleep_time=5 @@ -57,29 +57,14 @@ function rerun_accelerator_migration() { return 1 fi - dashboard_rs=$(oc get rs -n redhat-ods-applications | grep rhods-dashboard- | awk '{print $1;exit}') - echo "Deleting ReplicaSet $dashboard_rs" - if ! oc delete rs "$dashboard_rs" -n redhat-ods-applications; + dashboard_pod=$(oc get po -n redhat-ods-applications | grep rhods-dashboard- | awk '{print $1;exit}') + echo "Deleting pod $dashboard_pod" + if ! oc delete po "$dashboard_pod" -n redhat-ods-applications; then - printf "ERROR: When trying to delete the dashboard replica set\n" + printf "ERROR: When trying to delete Pod $dashboard_pod \n" return 1 fi - # Wait until all dashboard pods are ready again - SECONDS=0 - while [ "$SECONDS" -le "$timeout_seconds" ]; do - dashboard_pods=$(oc get deployment rhods-dashboard -n redhat-ods-applications | grep rhods-dashboard | awk '{print $2;exit}') - dashboard_pods_total=$(echo "$dashboard_pods" | cut -c3-3) - dashboard_pods_avail=$(echo "$dashboard_pods" | cut -c1-1) - ((remaining_seconds = timeout_seconds - SECONDS)) - echo "Dashboard pods: Available $dashboard_pods_avail out of $dashboard_pods_total ... (timeout in $remaining_seconds seconds)" - if [ "$dashboard_pods_avail" == "$dashboard_pods_total" ]; then - break - else - sleep $sleep_time - ((SECONDS+=$sleep_time)) - fi - done } wait_until_gpu_pods_are_running From 4cc8da492c6a35fe16f15535ced477c112587a19 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Wed, 25 Oct 2023 12:34:53 +0200 Subject: [PATCH 15/19] Pod deletion more clean and smart --- ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index 1751882ce..19ad47ab1 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -57,9 +57,9 @@ function rerun_accelerator_migration() { return 1 fi - dashboard_pod=$(oc get po -n redhat-ods-applications | grep rhods-dashboard- | awk '{print $1;exit}') + dashboard_pod=$(oc get po -n redhat-ods-applications -l app=rhods-dashboard -o name | head -n1) echo "Deleting pod $dashboard_pod" - if ! oc delete po "$dashboard_pod" -n redhat-ods-applications; + if ! oc delete "$dashboard_pod" -n redhat-ods-applications; then printf "ERROR: When trying to delete Pod $dashboard_pod \n" return 1 From b1fe64c1c0ac6d6e787b92a795db09ae3d723e00 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Wed, 25 Oct 2023 12:42:20 +0200 Subject: [PATCH 16/19] modify delete dashboard error message --- ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index 19ad47ab1..cc397f60d 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -61,7 +61,7 @@ function rerun_accelerator_migration() { echo "Deleting pod $dashboard_pod" if ! oc delete "$dashboard_pod" -n redhat-ods-applications; then - printf "ERROR: When trying to delete Pod $dashboard_pod \n" + printf "ERROR: When trying to delete Dashboard Pod\n" return 1 fi From 9c8746a1ac41ffd79423034d5fb70b35d86a117a Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Wed, 25 Oct 2023 15:34:10 +0200 Subject: [PATCH 17/19] Rollback restart instead of pod deletion --- ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index cc397f60d..4af313238 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -44,7 +44,7 @@ function rerun_accelerator_migration() { # As we are adding the GPUs after installing the RHODS operator, those GPUs are not discovered automatically. # In order to rerun the migration we need to # 1. Delete the migration configmap -# 2. Delete one of the dashboard pods, so the configmap is created again and the migration run again +# 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again # Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938 local timeout_seconds=600 @@ -57,11 +57,10 @@ function rerun_accelerator_migration() { return 1 fi - dashboard_pod=$(oc get po -n redhat-ods-applications -l app=rhods-dashboard -o name | head -n1) - echo "Deleting pod $dashboard_pod" - if ! oc delete "$dashboard_pod" -n redhat-ods-applications; + echo "Rollout restart rhods-dashboard deployment" + if ! oc rollout restart deployment.apps/rhods-dashboard -n redhat-ods-applications; then - printf "ERROR: When trying to delete Dashboard Pod\n" + printf "ERROR: When trying to rollout restart rhods-dashboard deployment\n" return 1 fi From 23c43ff671e3e8cf4448bf792ce662cb09433d02 Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Thu, 26 Oct 2023 14:45:43 +0200 Subject: [PATCH 18/19] fix typo in variable in workbenches --- .../Resources/Provisioning/GPU/gpu_deploy.sh | 84 +++++++++---------- .../Workbenches.resource | 4 +- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index 4af313238..d438ecb3d 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -1,44 +1,44 @@ #!/bin/bash # Make changes to gpu install file -GPU_INSTALL_DIR="$(dirname "$0")" - -CHANNEL=$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}') - -CSVNAME=$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -ojson | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV') - -sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" ${GPU_INSTALL_DIR}/gpu_install.yaml - -oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml - -function wait_until_gpu_pods_are_running() { - - local timeout_seconds=1200 - local sleep_time=90 - - echo "Waiting until gpu pods are in running state..." - - SECONDS=0 - while [ "$SECONDS" -le "$timeout_seconds" ]; do - pod_status=$(oc get pods -n "nvidia-gpu-operator" | grep gpu-operator | awk 'NR == 1 { print $3 }') - if [ "$pod_status" == "Running" ]; then - break - else - ((remaining_seconds = timeout_seconds - SECONDS)) - echo "GPU installation seems to be still running (timeout in $remaining_seconds seconds)..." - sleep $sleep_time - fi - done - - if [ "$pod_status" == "Running" ]; then - printf "GPU operator is up and running\n" - return 0 - else - printf "ERROR: Timeout reached while waiting for gpu operator to be in running state\n" - return 1 - fi - -} +#GPU_INSTALL_DIR="$(dirname "$0")" +# +#CHANNEL=$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}') +# +#CSVNAME=$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -ojson | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV') +# +#sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" ${GPU_INSTALL_DIR}/gpu_install.yaml +# +#oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml +# +#function wait_until_gpu_pods_are_running() { +# +# local timeout_seconds=1200 +# local sleep_time=90 +# +# echo "Waiting until gpu pods are in running state..." +# +# SECONDS=0 +# while [ "$SECONDS" -le "$timeout_seconds" ]; do +# pod_status=$(oc get pods -n "nvidia-gpu-operator" | grep gpu-operator | awk 'NR == 1 { print $3 }') +# if [ "$pod_status" == "Running" ]; then +# break +# else +# ((remaining_seconds = timeout_seconds - SECONDS)) +# echo "GPU installation seems to be still running (timeout in $remaining_seconds seconds)..." +# sleep $sleep_time +# fi +# done +# +# if [ "$pod_status" == "Running" ]; then +# printf "GPU operator is up and running\n" +# return 0 +# else +# printf "ERROR: Timeout reached while waiting for gpu operator to be in running state\n" +# return 1 +# fi +# +#} function rerun_accelerator_migration() { # As we are adding the GPUs after installing the RHODS operator, those GPUs are not discovered automatically. @@ -66,10 +66,10 @@ function rerun_accelerator_migration() { } -wait_until_gpu_pods_are_running -oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml -oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json -oc apply -f clusterpolicy.json +#wait_until_gpu_pods_are_running +#oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml +#oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json +#oc apply -f clusterpolicy.json rerun_accelerator_migration diff --git a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource index 130b286f7..393d1cad3 100644 --- a/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource +++ b/ods_ci/tests/Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Workbenches.resource @@ -18,7 +18,7 @@ ${WORKBENCH_SIZE_MENU_BTN_XP}= xpath=//section[@id="deployment-size"]/ ${WORKBENCH_SIZE_SIDE_MENU_BTN}= xpath=//nav[@aria-label="Jump to section"]//span[text()="Deployment size"] ${WORKBENCH_ACCELERATOR_DROPDOWN_XPATH}= xpath=//label[@for='modal-notebook-accelerator']/ancestor::div[@class='pf-c-form__group']/descendant::button ${WORKBENCH_ACCELERATOR_INPUT_XPATH}= xpath=//input[@aria-label='Number of accelerators'] -${WORKBENCH_ACCELERATOR_LESS_BUTTON_XPATH}= xpath=${WORKBENCH_INPUT_XPATH}/preceding-sibling::button +${WORKBENCH_ACCELERATOR_LESS_BUTTON_XPATH}= xpath=${WORKBENCH_ACCELERATOR_INPUT_XPATH}/preceding-sibling::button ${WORKBENCH_ACCELERATOR_PLUS_BUTTON_XPATH}= xpath=${WORKBENCH_ACCELERATOR_INPUT_XPATH}/following-sibling::button ${WORKBENCH_SIZE_ITEM_BTN_XP}= xpath=//ul[@data-id="container-size-select"]/li/button ${WORKBENCH_GPU_MENU_BTN_XP}= xpath=//section[@id="deployment-size"]//button[contains(@aria-labelledby,"gpu-numbers")] # robocop: disable @@ -346,7 +346,7 @@ Handle Stop Workbench Confirmation Modal Run Keyword And Continue On Failure ... Page Should Contain Are you sure you want to stop the workbench? Any changes without saving will be erased. Run Keyword And Continue On Failure Page Should Contain To save changes, access your - Run Keyword And Continue On Failure Page Should Contain Element xpath=//a[.="workbench"] + Run Keyword And Continue On Failure Page Should Contain Element xpath=//a[.="workbench"] END Run Keyword And Continue On Failure Page Should Contain Element xpath=//input[@id="dont-show-again"] Run Keyword And Continue On Failure Click Element xpath=//input[@id="dont-show-again"] From f9577ca3be559fdf0149523d23a08606d20dae9b Mon Sep 17 00:00:00 2001 From: Fede Alonso Date: Thu, 26 Oct 2023 21:25:48 +0200 Subject: [PATCH 19/19] Delete unused variables in gpu_deploy script --- .../Resources/Provisioning/GPU/gpu_deploy.sh | 87 +++++++++---------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index d438ecb3d..655d5db14 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -1,44 +1,44 @@ #!/bin/bash # Make changes to gpu install file -#GPU_INSTALL_DIR="$(dirname "$0")" -# -#CHANNEL=$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}') -# -#CSVNAME=$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -ojson | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV') -# -#sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" ${GPU_INSTALL_DIR}/gpu_install.yaml -# -#oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml -# -#function wait_until_gpu_pods_are_running() { -# -# local timeout_seconds=1200 -# local sleep_time=90 -# -# echo "Waiting until gpu pods are in running state..." -# -# SECONDS=0 -# while [ "$SECONDS" -le "$timeout_seconds" ]; do -# pod_status=$(oc get pods -n "nvidia-gpu-operator" | grep gpu-operator | awk 'NR == 1 { print $3 }') -# if [ "$pod_status" == "Running" ]; then -# break -# else -# ((remaining_seconds = timeout_seconds - SECONDS)) -# echo "GPU installation seems to be still running (timeout in $remaining_seconds seconds)..." -# sleep $sleep_time -# fi -# done -# -# if [ "$pod_status" == "Running" ]; then -# printf "GPU operator is up and running\n" -# return 0 -# else -# printf "ERROR: Timeout reached while waiting for gpu operator to be in running state\n" -# return 1 -# fi -# -#} +GPU_INSTALL_DIR="$(dirname "$0")" + +CHANNEL=$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}') + +CSVNAME=$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -ojson | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV') + +sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" ${GPU_INSTALL_DIR}/gpu_install.yaml + +oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml + +function wait_until_gpu_pods_are_running() { + + local timeout_seconds=1200 + local sleep_time=90 + + echo "Waiting until gpu pods are in running state..." + + SECONDS=0 + while [ "$SECONDS" -le "$timeout_seconds" ]; do + pod_status=$(oc get pods -n "nvidia-gpu-operator" | grep gpu-operator | awk 'NR == 1 { print $3 }') + if [ "$pod_status" == "Running" ]; then + break + else + ((remaining_seconds = timeout_seconds - SECONDS)) + echo "GPU installation seems to be still running (timeout in $remaining_seconds seconds)..." + sleep $sleep_time + fi + done + + if [ "$pod_status" == "Running" ]; then + printf "GPU operator is up and running\n" + return 0 + else + printf "ERROR: Timeout reached while waiting for gpu operator to be in running state\n" + return 1 + fi + +} function rerun_accelerator_migration() { # As we are adding the GPUs after installing the RHODS operator, those GPUs are not discovered automatically. @@ -47,9 +47,6 @@ function rerun_accelerator_migration() { # 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again # Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938 - local timeout_seconds=600 - local sleep_time=5 - echo "Deleting configmap migration-gpu-status" if ! oc delete configmap migration-gpu-status -n redhat-ods-applications; then @@ -66,10 +63,10 @@ function rerun_accelerator_migration() { } -#wait_until_gpu_pods_are_running -#oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml -#oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json -#oc apply -f clusterpolicy.json +wait_until_gpu_pods_are_running +oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml +oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json +oc apply -f clusterpolicy.json rerun_accelerator_migration