Skip to content

Commit

Permalink
- Update arti paths for USS
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrence Mitchem authored and jswaro committed Nov 14, 2024
1 parent 210a853 commit 1e3c64d
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 105 deletions.
21 changes: 12 additions & 9 deletions contrib/cray/cray_rpmbuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ function move_rpms() {
fi

# Move the RPMs and SRPMS to where the "Publish" stage expects to find them
mv `find rpmbuild/RPMS | grep rpm$` `find rpmbuild/SRPMS | grep rpm$` RPMS
cp `find rpmbuild/RPMS | grep rpm$` `find rpmbuild/SRPMS | grep rpm$` RPMS
chmod a+rX -R RPMS
}

Expand Down Expand Up @@ -55,20 +55,23 @@ configure_options="LDFLAGS=-Wl,--build-id --enable-only --enable-restricted-dl"

rpmbuilddir=$PWD/rpmbuild

if [[ "${TARGET_OS}" == sle* || "${TARGET_OS}" == rhel* ]]; then
with_cuda=0
with_rocm=0

if [[ "${TARGET_ARCH}" == x86_64 ]]; then
ROCM_CONFIG="--with-rocr=/opt/rocm --enable-rocr-dlopen"
else
ROCM_CONFIG=""
fi
if rpm -q cuda-drivers; then
with_cuda=1
CUDA_CONFIG="--with-cuda=/usr/local/cuda --enable-cuda-dlopen"
GDRCOPY_CONFIG="--enable-gdrcopy-dlopen"
else
CUDA_CONFIG=""
GDRCOPY_CONFIG=""
fi

if rpm -q amdgpu-dkms; then
with_rocm=1
ROCM_CONFIG="--with-rocr=/opt/rocm --enable-rocr-dlopen"
else
ROCM_CONFIG=""
CUDA_CONFIG=""
GDRCOPY_CONFIG=""
fi

if [[ ( ${TARGET_OS} == sle15_sp4* || ${TARGET_OS} == sle15_sp5* ) \
Expand Down
219 changes: 123 additions & 96 deletions contrib/cray/runBuildPrep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,36 @@ fi

TARGET_OS_SHORT=$(echo $TARGET_OS | sed -E 's/(_cn|_ncn)$//')

CNE_BRANCH=""
OSNAME_SHORT="invalid"

case "${OBS_TARGET_OS}" in
sle15_sp4_*) COS_BRANCH='release/cos-2.5' ;;
cos_2_5_*) COS_BRANCH='release/cos-2.5' ;;
csm_1_4_*) COS_BRANCH='release/cos-2.5' ;;
cos_3_1_*) COS_BRANCH='release/uss-1.1' ;;
cos_3_2_*) COS_BRANCH='release/uss-1.2' ;;
csm_1_5_*) COS_BRANCH='release/uss-1.1' ;;
csm_1_6_*) COS_BRANCH='release/uss-1.2' ;;
sle15_sp5_*) COS_BRANCH='release/uss-1.1' ;;
sle15_sp6_*) COS_BRANCH='release/uss-1.2' ;;
sle15_sp4_*) COS_BRANCH='release/cos-2.5'
OSNAME_SHORT="sle15_sp4_cn"
;;
cos_2_5_*) COS_BRANCH='release/cos-2.5'
OSNAME_SHORT="sle15_sp4_cn";
;;
csm_1_4_*) COS_BRANCH='release/cos-2.5'
OSNAME_SHORT="sle15_sp4_cn"
;;
cos_3_1_*) COS_BRANCH='release/uss-1.1'
OSNAME_SHORT="sle15_sp5"
;;
cos_3_2_*) COS_BRANCH='release/uss-1.2'
OSNAME_SHORT="sle15_sp6"
;;
csm_1_5_*) COS_BRANCH='release/uss-1.1'
OSNAME_SHORT="sle15_sp5"
;;
csm_1_6_*) COS_BRANCH='release/uss-1.2'
OSNAME_SHORT="sle15_sp6"
;;
sle15_sp5_*) COS_BRANCH='release/uss-1.1'
OSNAME_SHORT="sle15_sp5";
;;
sle15_sp6_*) COS_BRANCH='release/uss-1.2'
OSNAME_SHORT="sle15_sp6"
;;
*) COS_BRANCH='dev/master' ;;
esac

Expand Down Expand Up @@ -224,6 +242,15 @@ function install_gdrcopy() {

install_gdrcopy

if rpm -q cuda-drivers; then
with_cuda=1
fi

if rpm -q amdgpu-dkms; then
with_rocm=1
fi


if command -v yum > /dev/null; then

yum-config-manager --add-repo=${ARTI_URL}/${PRODUCT}-${ARTI_LOCATION}/${ARTI_BRANCH}/${TARGET_OS}/
Expand All @@ -232,12 +259,6 @@ if command -v yum > /dev/null; then
if [ $OS_TYPE = "rhel" ] && \
[[ $RHEL_GPU_SUPPORTED_VERSIONS = *$OS_VERSION* ]]; then

if [[ ${TARGET_ARCH} == x86_64 ]]; then
with_rocm=1
fi

with_cuda=1

case $OS_VERSION in
8.8)
ROCM_VERSION="5.7"
Expand Down Expand Up @@ -292,22 +313,14 @@ if command -v yum > /dev/null; then
yum install -y $RPMS

elif command -v zypper > /dev/null; then
with_cuda=1

if [[ ${TARGET_ARCH} == x86_64 ]]; then
with_rocm=1
fi

case "${OBS_TARGET_OS}" in
sle15_sp4_*) CUDA_RPMS="nvhpc-2023"
;;
cos_2_5_*) CUDA_RPMS="nvhpc-2023"
;;
csm_1_4_*) CUDA_RPMS="nvhpc-2023"
;;
csm_1_5_*) CUDA_RPMS="nvhpc"
;;
csm_1_6_*) CUDA_RPMS="nvhpc"
csm_1_*) with_cuda=0
with_rocm=0
;;
cos_3_1_*) CUDA_RPMS="nvhpc"
;;
Expand All @@ -330,11 +343,11 @@ elif command -v zypper > /dev/null; then
if [[ "${COS_BRANCH}" == release/uss-* ]]; then
zypper --verbose --non-interactive addrepo --no-gpgcheck --check \
--priority 20 --name=cuda \
${ARTI_URL}/uss-internal-third-party-rpm-local/nvidia_hpc_sdk/${COS_BRANCH}/${TARGET_OS_SHORT}/ cuda
${ARTI_URL}/uss-internal-third-party-rpm-local/nvidia_hpc_sdk/${COS_BRANCH}/${OSNAME_SHORT}/ cuda
else
zypper --verbose --non-interactive addrepo --no-gpgcheck --check \
--priority 20 --name=cuda \
${ARTI_URL}/cos-internal-third-party-generic-local/nvidia_hpc_sdk/${TARGET_OS}/${TARGET_ARCH}/${COS_BRANCH}/ cuda
${ARTI_URL}/cos-internal-third-party-generic-local/nvidia_hpc_sdk/${OSNAME_SHORT}/${TARGET_ARCH}/${COS_BRANCH}/ cuda
fi

if [[ ! -v SHS_NEW_BUILD_SYSTEM ]]; then
Expand All @@ -346,13 +359,16 @@ elif command -v zypper > /dev/null; then
if [[ "${COS_BRANCH}" == release/uss-* ]]; then
zypper --verbose --non-interactive addrepo --no-gpgcheck --check \
--priority 20 --name=rocm \
${ARTI_URL}/uss-internal-third-party-rpm-local/rocm/${COS_BRANCH}/${TARGET_OS_SHORT}/ rocm
${ARTI_URL}/uss-internal-third-party-rpm-local/rocm/${COS_BRANCH}/${OSNAME_SHORT}/ rocm
else
zypper --verbose --non-interactive addrepo --no-gpgcheck --check \
--priority 20 --name=rocm \
${ARTI_URL}/cos-internal-third-party-generic-local/rocm/latest/${TARGET_OS}/${TARGET_ARCH}/${COS_BRANCH}/ rocm
${ARTI_URL}/cos-internal-third-party-generic-local/rocm/latest/${OSNAME_SHORT}/${TARGET_ARCH}/${COS_BRANCH}/ rocm
fi

if [[ ! -v SHS_NEW_BUILD_SYSTEM ]]; then
RPMS+=" ${ROCR_RPMS} "
fi
RPMS+=" ${ROCR_RPMS} "
fi

if [[ $with_ze -eq 1 ]]; then
Expand Down Expand Up @@ -380,82 +396,93 @@ fi

set -x

if [[ $with_cuda -eq 1 ]]; then
function gpu_config_nvidia() {
nvhpc_sdk_version="not supported"
rocm_version="not supported"

if [[ $with_cuda -eq 1 ]]; then

# Specify the directory where you want to search for folders
search_dir="/opt/nvidia/hpc_sdk/Linux_${TARGET_ARCH}"
# Specify the directory where you want to search for folders
search_dir="/opt/nvidia/hpc_sdk/Linux_${TARGET_ARCH}"

# Define a pattern to match folders in the "x.y" format
pattern='^[0-9]+\.[0-9]+$'
# Define a pattern to match folders in the "x.y" format
pattern='^[0-9]+\.[0-9]+$'

# Initialize variables to keep track of the latest folder and its version
latest_version=""
latest_folder=""
# Initialize variables to keep track of the latest folder and its version
latest_version=""
latest_folder=""

# Iterate through the directories in the search directory
for dir in "$search_dir"/*; do
if [[ -d "$dir" && $(basename "$dir") =~ $pattern ]]; then
version="$(basename "$dir")"
if [[ -z "$latest_version" || "$version" > "$latest_version" ]]; then
latest_version="$version"
latest_folder="$dir"
# Iterate through the directories in the search directory
for dir in "$search_dir"/*; do
if [[ -d "$dir" && $(basename "$dir") =~ $pattern ]]; then
version="$(basename "$dir")"
if [[ -z "$latest_version" || "$version" > "$latest_version" ]]; then
latest_version="$version"
latest_folder="$dir"
fi
fi
fi
done

# Check if any matching folders were found
if [ -n "$latest_folder" ]; then
nvhpc_sdk_version="$latest_version"
echo "Using $nvhpc_sdk_version at $latest_folder"
nvhpc_cuda_path=$latest_folder/cuda
echo "Using $nvhpc_sdk_version at $nvhpc_cuda_path"

# Convenient symlink which allows the libfabric build process to not
# have to call out a specific versioned CUDA directory.
ln -s $nvhpc_cuda_path /usr/local/cuda

# The CUDA device driver RPM provides a usable libcuda.so which is
# required by the libfabric autoconf checks. Since artifactory does not
# provide this RPM, the cuda-driver-devel-11-0 RPM is installed and
# provides a stub libcuda.so. But, this stub libcuda.so is installed
# into a non-lib path. A symlink is created to fix this.
ln -s /usr/local/cuda/lib64/stubs/libcuda.so \
/usr/local/cuda/lib64/libcuda.so
done

# Check if any matching folders were found
if [ -n "$latest_folder" ]; then
nvhpc_sdk_version="$latest_version"
echo "Using $nvhpc_sdk_version at $latest_folder"
nvhpc_cuda_path=$latest_folder/cuda
echo "Using $nvhpc_sdk_version at $nvhpc_cuda_path"

# Convenient symlink which allows the libfabric build process to not
# have to call out a specific versioned CUDA directory.
ln -s $nvhpc_cuda_path /usr/local/cuda

# The CUDA device driver RPM provides a usable libcuda.so which is
# required by the libfabric autoconf checks. Since artifactory does not
# provide this RPM, the cuda-driver-devel-11-0 RPM is installed and
# provides a stub libcuda.so. But, this stub libcuda.so is installed
# into a non-lib path. A symlink is created to fix this.
ln -s /usr/local/cuda/lib64/stubs/libcuda.so \
/usr/local/cuda/lib64/libcuda.so

else
echo "No matching CUDA folders found."
exit 1
else
echo "No matching CUDA folders found."
exit 1
fi
fi
fi
}

if [[ $with_rocm -eq 1 ]]; then
update-alternatives --display rocm

# Find the ROCm version directory in /opt/
rocm_version_dir=$(ls -d /opt/rocm-* 2>/dev/null)

# Check if a ROCm version directory was found
if [ -n "$rocm_version_dir" ]; then
# Extract the version from the directory path
rocm_version=$(basename "$rocm_version_dir")

# Check if the version follows the expected format
if [[ $rocm_version =~ ^rocm-[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "ROCm version: $rocm_version"
ln -s /opt/"$rocm_version" /opt/rocm
function gpu_config_rocm() {
if [[ $with_rocm -eq 1 ]]; then
update-alternatives --display rocm

# Find the ROCm version directory in /opt/
rocm_version_dir=$(ls -d /opt/rocm-* 2>/dev/null)

# Check if a ROCm version directory was found
if [ -n "$rocm_version_dir" ]; then
# Extract the version from the directory path
rocm_version=$(basename "$rocm_version_dir")

# Check if the version follows the expected format
if [[ $rocm_version =~ ^rocm-[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "ROCm version: $rocm_version"
ln -s /opt/"$rocm_version" /opt/rocm
else
echo "Unexpected directory structure found: $rocm_version"
exit 1
fi
else
echo "Unexpected directory structure found: $rocm_version"
echo "The installation of ROCm is not found in the /opt/ directory."
exit 1
fi
else
echo "The installation of ROCm is not found in the /opt/ directory."
exit 1
rocm_version="not-found"
fi
else
rocm_version="not-found"
fi

echo "ROCm Version: ${rocm_version}" > /var/tmp/gpu-versions
echo "Nvidia Version: ${nvhpc_sdk_version}" >> /var/tmp/gpu-versions
echo "GPU Versions File:"
echo "$(</var/tmp/gpu-versions)"
echo "ROCm Version: ${rocm_version}" > /var/tmp/gpu-versions
echo "Nvidia Version: ${nvhpc_sdk_version}" >> /var/tmp/gpu-versions
echo "GPU Versions File:"
echo "$(</var/tmp/gpu-versions)"
}

gpu_config_nvidia
gpu_config_rocm

0 comments on commit 1e3c64d

Please sign in to comment.