Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport "Update AMD Operator and NFD install scripts (#2139)" in releases/2.10.0 #2171

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,32 @@ EOF
fi
}

#######################################
# Workaround for OCP versions older than 4.16.
#
# The AMD certified operator is only published starting from OCP v4.16, so on
# older clusters a custom CatalogSource pointing at the v4.16 certified
# operator index is created, and the AMD install manifest is rewritten to
# consume that catalog instead of the default "certified-operators" one.
#
# Globals:
#   GPU_INSTALL_DIR  (read)    directory containing amd_gpu_install.yaml
#   ocpVersion       (written) version string reported by `oc version`
#   ocpVersionSplit  (written) ocpVersion split on '.'
# Arguments: none
# Outputs:   progress messages on stdout, a warning on stderr when the
#            cluster version cannot be determined
# Returns:   0 when the workaround is not needed or the version is unknown;
#            otherwise the status of the final sed command
#######################################
function applyWorkaroundForOlderOCPVersions () {
  local defaultCatalog="certified-operators"
  local customCatalog="certified-operators-416-amd"
  local major minor

  # ocpVersion/ocpVersionSplit are deliberately NOT local: the rest of the
  # script is not visible from here and may read them after this call.
  # `// empty` turns a missing field into an empty string instead of "null".
  ocpVersion=$(oc version --output json | jq -r '.openshiftVersion // empty') || ocpVersion=""
  IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
  major="${ocpVersionSplit[0]:-}"
  minor="${ocpVersionSplit[1]:-}"

  # Without a numeric major.minor there is nothing sensible to compare; say so
  # explicitly instead of letting the numeric test fail with a cryptic error.
  if ! [[ "$major" =~ ^[0-9]+$ && "$minor" =~ ^[0-9]+$ ]]; then
    echo "WARNING: unable to parse OCP version '$ocpVersion', skipping AMD catalog source workaround" >&2
    return 0
  fi

  # Compare the full major.minor pair (10# forces base 10 so that a leading
  # zero is never read as octal). Checking the minor alone would also match
  # any future major release whose minor happens to be below 16.
  if (( 10#$major < 4 || (10#$major == 4 && 10#$minor < 16) )); then
    echo "OCP Version: $ocpVersion"
    echo "AMD Operator is not available for versions < 4.16, hence creating custom catalog source as workaround"
    oc apply -f - <<EOF
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: ${customCatalog}
  namespace: openshift-marketplace
spec:
  displayName: Certfied operator
  image: 'registry.redhat.io/redhat/certified-operator-index:v4.16'
  publisher: RHOAI QE
  sourceType: grpc
EOF
    oc wait --timeout="120s" --for=condition=ready=true pod -n openshift-marketplace -l "olm.catalogSource=${customCatalog}"
    # The optional "-416-amd" group makes the rewrite idempotent: a manifest
    # that was already patched by a previous run matches as a whole and is
    # replaced by itself instead of growing a second "-416-amd" suffix.
    sed -i'' -e "s/${defaultCatalog}\(-416-amd\)\{0,1\}/${customCatalog}/g" "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
  fi
}

applyWorkaroundForOlderOCPVersions
check_registry
status=$?

Expand All @@ -170,10 +196,7 @@ fi
sleep 120
wait_while 1800 ! machineconfig_updates

echo "Installing NFD operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
wait_while 360 ! has_csv_succeeded openshift-nfd nfd
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
echo "Installing KMM operator"
oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management
Expand Down
29 changes: 29 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Installs the Node Feature Discovery (NFD) operator and creates the NFD
# instance from the nfd_deploy.yaml template located next to this script.
#
# The operand image is selected from the cluster's OCP major.minor version.
# Unknown versions fall back to the 4.17 image with a warning.
#
# Requires: oc (logged in to the target cluster), jq, sed.
set -e

NFD_INSTALL_DIR="$(dirname "$0")"
NFD_INSTANCE="$NFD_INSTALL_DIR/nfd_deploy.yaml"
# Version whose image is re-used when the cluster version has no entry below.
DEFAULT_XY_VERSION="4.17"

echo "Installing NFD operator"
oc apply -f "$NFD_INSTALL_DIR/nfd_operator.yaml"
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd

ocpVersion=$(oc version --output json | jq -r '.openshiftVersion')
IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
xyVersion="${ocpVersionSplit[0]:-}.${ocpVersionSplit[1]:-}"

# Image references are stored verbatim (no sed escaping) so that they are
# logged exactly as they will appear in the applied manifest.
declare -A images=(
  ["4.14"]="registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:2977e67a413882efbfb90b52facf65d38a5cb2cd7a232ca3a69476e5dec33319"
  ["4.15"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:661b6697dee34626a3a98b50cdba787402ab214d2807b8460df92e3c79cdfcc5"
  ["4.16"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:bb95bc317ab78e8af4ef34dd66f9f62c2f8c261dfb5eab40918142812802f8b7"
  ["4.17"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:154cf3f1ddaf895d7ecd04947bd455a930132f72acc6e8bde8c26bc123184ace"
  # 4.18 is a pre-release image. We need to update it later
  ["4.18"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:510cb4351253492455664b6c323f54dc2f6f2f8791c5e92ba6b7e60b8adb357c"
)

if [[ -n "${images[$xyVersion]:-}" ]]; then
  imageUrl="${images[$xyVersion]}"
  echo "Using image SHA for $xyVersion: $imageUrl"
else
  imageUrl="${images[$DEFAULT_XY_VERSION]}"
  echo "WARNING: I don't know the sha for $xyVersion. Re-using default $DEFAULT_XY_VERSION $imageUrl. It might not work!"
fi

# Render the template in memory instead of editing it in place: an in-place
# edit consumes the <imageUrl> placeholder, so a later run against a cluster
# with a different OCP version would silently keep the first image.
# '|' is used as the sed delimiter because image references contain '/'.
# The assignment is a separate statement so a sed failure aborts under set -e.
renderedInstance=$(sed -e "s|<imageUrl>|${imageUrl}|g" "$NFD_INSTANCE")
oc apply -f - <<< "$renderedInstance"
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ spec:
instance: "" # instance is empty by default
topologyupdater: false # False by default
operand:
image: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11
# Image URL example: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11
image: <imageUrl>
imagePullPolicy: Always
workerConfig:
configData: |
Expand Down
6 changes: 2 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla
sed -i'' -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"
/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified

Expand Down Expand Up @@ -80,7 +79,6 @@ function rerun_accelerator_migration() {
}

wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
Expand Down
Loading