NVIDIA · Steboss · Jan 14, 2025 · Jan 14, 2025 · Jan 15, 2025 · Jan 15, 2025
diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base
@@ -1,27 +1,10 @@
 # syntax=docker/dockerfile:1-labs
-ARG BASE_IMAGE=nvidia/cuda:12.6.3-devel-ubuntu24.04
+ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04
 ARG GIT_USER_NAME="JAX Toolbox"
 ARG GIT_USER_EMAIL=jax@nvidia.com
 ARG CLANG_VERSION=18
 ARG JAX_TOOLBOX_REF
 
-###############################################################################
-## Obtain GCP's NCCL TCPx plugin
-###############################################################################
-
-FROM us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx:v3.1.10 AS tcpx-installer-amd64
-
-# make a stub arm64 container because GCP does not provide an arm64 version of the plugin
-FROM ubuntu AS tcpx-installer-arm64
-RUN <<"OUTEREOF" bash -ex
-mkdir -p /scripts /var/lib/tcpx/lib64
-echo '#!/bin/bash' > /scripts/container_entry.sh
-chmod +x /scripts/container_entry.sh
-OUTEREOF
-
-FROM tcpx-installer-${TARGETARCH} AS tcpx-installer
-RUN /scripts/container_entry.sh install
-
 ###############################################################################
 ## Build base image
 ###############################################################################
@@ -153,72 +136,20 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
 RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*
 
 ###############################################################################
-## Install TCPx
-###############################################################################
-
-ENV TCPX_LIBRARY_PATH=/usr/local/tcpx/lib64
-COPY --from=tcpx-installer /var/lib/tcpx/lib64 ${TCPX_LIBRARY_PATH}
-
-###############################################################################
-## Install the latest versions of Nsight Systems and Nsight Compute
-###############################################################################
-
-ADD install-nsight.sh /usr/local/bin
-RUN install-nsight.sh
-
-###############################################################################
-## Install cuDNN
+## Symlink for cuDNN
 ###############################################################################
 
 ADD install-cudnn.sh /usr/local/bin
 RUN install-cudnn.sh
 
 ###############################################################################
-## Install NCCL
+## Symlink for NCCL
 ###############################################################################
 
+# Same as for cuDNN: install-nccl.sh only symlinks the NCCL packages that are already installed
 ADD install-nccl.sh /usr/local/bin
 RUN install-nccl.sh
 
-###############################################################################
-## RoCE and InfiniteBand support
-###############################################################################
-
-ADD install-ofed.sh /usr/local/bin
-RUN install-ofed.sh
-
-##############################################################################
-## Amazon EFA support (need to run it inside container separately)
-##############################################################################
-
-ADD --chmod=777 \
-  install-efa.sh \
-  test-aws-efa.sh \
-  /usr/local/bin/
-ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH}
-ENV PATH=/opt/amazon/efa/bin:${PATH}
-
-##############################################################################
-## NCCL sanity check utility
-##############################################################################
-
-ADD install-nccl-sanity-check.sh /usr/local/bin
-ADD nccl-sanity-check.cu /opt
-RUN install-nccl-sanity-check.sh
-ADD jax-nccl-test parallel-launch /usr/local/bin/
-
-###############################################################################
-## Add the systemcheck to the entrypoint.
-###############################################################################
-
-COPY check-shm.sh /opt/nvidia/entrypoint.d/
-
-###############################################################################
-## Add the GCP - TCPX check to the entrypoint.
-###############################################################################
-
-# TODO(chaserileyroberts): Reenable once fully tested on GCP.
-# COPY gcp-autoconfig.sh /opt/nvidia/entrypoint.d/
 
 ###############################################################################
 ## Install the nsys-jax JAX/XLA-aware profiling scripts, patch Nsight Systems

diff --git a/.github/container/install-cudnn.sh b/.github/container/install-cudnn.sh
@@ -2,46 +2,6 @@
 
 set -ex
 
-export DEBIAN_FRONTEND=noninteractive
-export TZ=America/Los_Angeles
-
-CUDNN_MAJOR_VERSION=9
-
-apt-get update
-
-# Extract major CUDA version from `nvcc --version` output line
-# Input: "Cuda compilation tools, release X.Y, VX.Y.Z"
-# Output: X
-cuda_major_version=$(nvcc --version | sed -n 's/^.*release \([0-9]*\.[0-9]*\).*$/\1/p' | cut -d. -f1)
-
-# Find latest cuDNN version compatible with existing CUDA by matching
-# ${cuda_major_version} in the package version string
-# In most cases cuDNN release is behind CUDA ones. It is considered, that major 
-# version of CUDA and cuDNN are compatible.
-# For example, CUDA 12.3 + cuDNN 8.9.6 (libcudnn8 version: 8.9.6.50-1+cuda12.2) is 
-# considered to be compatible.
-if [[ ${CUDNN_MAJOR_VERSION} -le 8 ]]; then
-    libcudnn_name=libcudnn${CUDNN_MAJOR_VERSION}
-    libcudnn_dev_name=libcudnn${CUDNN_MAJOR_VERSION}-dev
-    version_pattern="s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p"
-elif [[ ${CUDNN_MAJOR_VERSION} -eq 9 ]]; then
-    libcudnn_name=libcudnn${CUDNN_MAJOR_VERSION}-cuda-${cuda_major_version}
-    libcudnn_dev_name=libcudnn${CUDNN_MAJOR_VERSION}-dev-cuda-${cuda_major_version}
-    version_pattern="s/^Version: \(${CUDNN_MAJOR_VERSION}\.[0-9.-]*\)$/\1/p"
-fi
-libcudnn_version=$(apt-cache show $libcudnn_name |  sed -n "$version_pattern" | head -n 1)
-libcudnn_dev_version=$(apt-cache show $libcudnn_dev_name | sed -n "$version_pattern" | head -n 1)
-if [[ -z "${libcudnn_version}" || -z "${libcudnn_dev_version}" ]]; then
-    echo "Could not find compatible cuDNN version for CUDA ${cuda_version}"
-    exit 1
-fi
-
-apt-get install -y \
-    ${libcudnn_name}=${libcudnn_version} \
-    ${libcudnn_dev_name}=${libcudnn_dev_version}
-apt-get clean
-rm -rf /var/lib/apt/lists/*
-
 # Create a prefix with include/ and lib/ directories containing symlinks to the cuDNN
 # version that was just installed; this is useful to pass to XLA to avoid it fetching
 # its own copy of cuDNN.
@@ -50,8 +10,15 @@ if [[ -d "${prefix}" ]]; then
   echo "Skipping link farm creation"
   exit 1
 fi
+
 arch=$(uname -m)-linux-gnu
-for cudnn_file in $(dpkg -L ${libcudnn_name} ${libcudnn_dev_name} | sort -u); do
+libcudnn_pkgs=$(dpkg -l 'libcudnn*' | awk '/^ii/ {print $2}')
+if [[ -z "${libcudnn_pkgs}" ]]; then
+  echo "No libcudnn packages installed."
+  exit 1
+fi
+
+for cudnn_file in $(dpkg -L ${libcudnn_pkgs} | sort -u); do
   # Real files and symlinks are linked into $prefix
   if [[ -f "${cudnn_file}" || -h "${cudnn_file}" ]]; then
     # Replace /usr with $prefix
@@ -70,3 +37,26 @@ for cudnn_file in $(dpkg -L ${libcudnn_name} ${libcudnn_dev_name} | sort -u); do
     echo "Skipping ${cudnn_file}"
   fi
 done
+
+# Replicate the original header symlinks too, so we'll have /opt/nvidia/cudnn/include/cudnn.h
+find /usr/include -maxdepth 1 -name "cudnn*.h" -type l | while read -r symlink; do
+  symlink_name=$(basename "${symlink}")
+  symlink_target=$(readlink "${symlink}")
+  # Check if the symlink points into the arch-specific directory (${arch}/, e.g. x86_64-linux-gnu/)
+  if [[ "${symlink_target}" == "${arch}/"* ]]; then
+    # Adjust the symlink target to point within our symlink directory
+    adjusted_target="${prefix}/include/${symlink_target#${arch}/}"
+    # Destination symlink within the symlink directory
+    link_name="${prefix}/include/${symlink_name}"
+    link_dir=$(dirname "${link_name}")
+    mkdir -p "${link_dir}"
+    # Check if the symlink already exists
+    if [[ -e "${link_name}" ]]; then
+      echo "Symlink ${link_name} already exists. Skipping."
+    else
+      ln -s "${adjusted_target}" "${link_name}"
+    fi
+  else
+    echo "Skipping symlink ${symlink} with target ${symlink_target}"
+  fi
+done
diff --git a/.github/container/install-nccl.sh b/.github/container/install-nccl.sh
@@ -2,37 +2,6 @@
 
 set -ex -o pipefail
 
-export DEBIAN_FRONTEND=noninteractive
-export TZ=America/Los_Angeles
-
-# If NCCL is already installed, don't reinstall it. Print a message and exit
-if dpkg -s libnccl2 libnccl-dev &> /dev/null; then
-    echo "NCCL is already installed. Skipping installation."
-else
-    apt-get update
-
-    # Extract CUDA version from `nvcc --version` output line
-    # Input: "Cuda compilation tools, release X.Y, VX.Y.Z"
-    # Output: X.Y
-    cuda_version=$(nvcc --version | sed -n 's/^.*release \([0-9]*\.[0-9]*\).*$/\1/p')
-
-    # Find latest NCCL version compatible with existing CUDA by matching
-    # ${cuda_version} in the package version string
-    libnccl2_version=$(apt-cache show libnccl-dev | sed -n "s/^Version: \(.*+cuda${cuda_version}\)$/\1/p" | head -n 1)
-    libnccl_dev_version=$(apt-cache show libnccl-dev | sed -n "s/^Version: \(.*+cuda${cuda_version}\)$/\1/p" | head -n 1)
-    if [[ -z "${libnccl2_version}" || -z "${libnccl_dev_version}" ]]; then
-        echo "Could not find compatible NCCL version for CUDA ${cuda_version}"
-        exit 1
-    fi
-
-    apt-get install -y \
-        libnccl2=${libnccl2_version} \
-        libnccl-dev=${libnccl_dev_version}
-
-    apt-get clean
-    rm -rf /var/lib/apt/lists/*
-fi
-
 # Create a prefix with include/ and lib/ directories containing symlinks to the NCCL
 # version installed at the system level; this is useful to pass to XLA to avoid it
 # fetching its own copy.
@@ -42,7 +11,15 @@ if [[ -d "${prefix}" ]]; then
   exit 1
 fi
 arch=$(uname -m)-linux-gnu
-for nccl_file in $(dpkg -L libnccl2 libnccl-dev | sort -u); do
+nccl_packages=$(dpkg -l 'libnccl*' | awk '/^ii/ {print $2}')
+
+if [[ -z "${nccl_packages}" ]]; then
+  echo "No NCCL packages installed."
+  exit 1
+fi
+
+
+for nccl_file in $(dpkg -L ${nccl_packages} | sort -u); do
   # Real files and symlinks are linked into $prefix
   if [[ -f "${nccl_file}" || -h "${nccl_file}" ]]; then
     # Replace /usr with $prefix and remove arch-specific lib directories