From 546b90a1e2186b796c0ded9bb8664189a4622943 Mon Sep 17 00:00:00 2001 From: Berto D'Attoma <88311595+bdattoma@users.noreply.github.com> Date: Fri, 10 Jan 2025 10:22:41 +0100 Subject: [PATCH] Update AMD Operator and NFD install scripts (#2139) * workaround for amd certified operator in ocp < 4.16 * add NFD installation script and use it in AMD script * use NFD install script in NVIDIA script * minor change * update warn msg * rm unused function --- .../Provisioning/GPU/AMD/amd_operator.sh | 31 ++++++++++++++++--- .../Provisioning/GPU/NFD/install_nfd.sh | 29 +++++++++++++++++ .../GPU/{ => NFD}/nfd_deploy.yaml | 4 +-- .../GPU/{ => NFD}/nfd_operator.yaml | 0 .../Provisioning/GPU/NVIDIA/gpu_deploy.sh | 6 ++-- 5 files changed, 60 insertions(+), 10 deletions(-) create mode 100644 ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh rename ods_ci/tasks/Resources/Provisioning/GPU/{ => NFD}/nfd_deploy.yaml (85%) rename ods_ci/tasks/Resources/Provisioning/GPU/{ => NFD}/nfd_operator.yaml (100%) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh index 1f5c2e95c..fd38ff64c 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh @@ -164,6 +164,32 @@ EOF fi } +function applyWorkaroundForOlderOCPVersions () { + # workaround for OCP versions less than 4.16 + # AMD certified operator is published starting from OCP v4.16 + ocpVersion=$(oc version --output json | jq '.openshiftVersion' | tr -d '"') + IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion" + if [ "${ocpVersionSplit[1]}" -lt 16 ]; then + echo "OCP Version: $ocpVersion" + echo "AMD Operator is not available for versions < 4.16, hence creating custom catalog source as workaround" + oc apply -f - </$imageUrl/g" $NFD_INSTANCE +oc apply -f "$NFD_INSTANCE" diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml similarity index 85% rename from ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml rename to ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml index 113980150..ffa3dd1f9 100644 --- a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml +++ b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml @@ -7,8 +7,8 @@ spec: instance: "" # instance is empty by default topologyupdater: false # False by default operand: - # Image digest for registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11 - image: registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:d6242132d2ddec00c46d22b63015a33af821eace0150ba47d185cd992fee317d + # Image URL example: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11 + image: imagePullPolicy: Always workerConfig: configData: | diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_operator.yaml similarity index 100% rename from ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml rename to ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_operator.yaml diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh index 56934125a..3a53f327e 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh @@ -12,10 +12,9 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla sed -i'' -e "0,/v1.11/s//$CHANNEL/g" "$GPU_INSTALL_DIR/gpu_install.yaml" oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml" -oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml" -echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete" +/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh -oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd +echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete" oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified @@ -88,7 +87,6 @@ function rerun_accelerator_migration() { } wait_until_pod_ready_status "gpu-operator" -oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml" oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json oc apply -f clusterpolicy.json wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600