From a3e9b99f14e8c9c8e514d1fdec6cc015fed05a29 Mon Sep 17 00:00:00 2001 From: Berto D'Attoma <88311595+bdattoma@users.noreply.github.com> Date: Fri, 10 Jan 2025 10:22:41 +0100 Subject: [PATCH 1/2] Update AMD Operator and NFD install scripts (#2139) * workaround for amd certified operator in ocp < 4.16 * add NFD installation script and use it in AMD script * use NFD install script in NVIDIA script * minor change * update warn msg * rm unused function --- .../Provisioning/GPU/AMD/amd_operator.sh | 31 ++++++++++++++++--- .../Provisioning/GPU/NFD/install_nfd.sh | 29 +++++++++++++++++ .../GPU/{ => NFD}/nfd_deploy.yaml | 5 +++ .../GPU/{ => NFD}/nfd_operator.yaml | 0 .../Provisioning/GPU/NVIDIA/gpu_deploy.sh | 6 ++-- 5 files changed, 63 insertions(+), 8 deletions(-) create mode 100644 ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh rename ods_ci/tasks/Resources/Provisioning/GPU/{ => NFD}/nfd_deploy.yaml (81%) rename ods_ci/tasks/Resources/Provisioning/GPU/{ => NFD}/nfd_operator.yaml (100%) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh index 4ff4b4d88..aa27ef97b 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh @@ -157,6 +157,32 @@ EOF fi } +function applyWorkaroundForOlderOCPVersions () { + # workaround for OCP versions less than 4.16 + # AMD certified operator is published starting from OCP v4.16 + ocpVersion=$(oc version --output json | jq '.openshiftVersion' | tr -d '"') + IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion" + if [ "${ocpVersionSplit[1]}" -lt 16 ]; then + echo "OCP Version: $ocpVersion" + echo "AMD Operator is not available for versions < 4.16, hence creating custom catalog source as workaround" + oc apply -f - </$imageUrl/g" $NFD_INSTANCE +oc apply -f "$NFD_INSTANCE" diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml similarity index 81% rename from ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml rename to ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml index 4cb56c3af..aaf1408c5 100644 --- a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml +++ b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml @@ -7,7 +7,12 @@ spec: instance: "" # instance is empty by default topologyupdater: false # False by default operand: +<<<<<<< HEAD:ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml image: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11 +======= + # Image URL example: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11 + image: +>>>>>>> a0052546 (Update AMD Operator and NFD install scripts (#2139)):ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml imagePullPolicy: Always workerConfig: configData: | diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_operator.yaml similarity index 100% rename from ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml rename to ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_operator.yaml diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh index b45728d97..45c4731bf 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh @@ -12,10 +12,9 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla sed -i'' -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml" oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml" -oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml" -echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete" +/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh -oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd +echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete" oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified @@ -80,7 +79,6 @@ function rerun_accelerator_migration() { } wait_until_pod_ready_status "gpu-operator" -oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml" oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json oc apply -f clusterpolicy.json wait_until_pod_ready_status "nvidia-device-plugin-daemonset" From 0e43c44b470e0f2ae74e8968125501b9025a14da Mon Sep 17 00:00:00 2001 From: bdattoma Date: Fri, 10 Jan 2025 12:37:26 +0100 Subject: [PATCH 2/2] fix conflict --- ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml index aaf1408c5..ffa3dd1f9 100644 --- a/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml +++ b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml @@ -7,12 +7,8 @@ spec: instance: "" # instance is empty by default topologyupdater: false # False by default operand: -<<<<<<< HEAD:ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml - image: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11 -======= # Image URL example: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11 image: ->>>>>>> a0052546 (Update AMD Operator and NFD install scripts (#2139)):ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml imagePullPolicy: Always workerConfig: configData: |