From 83912749f50b1d7437297be770ef9e5654984617 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Tue, 9 Apr 2024 16:30:09 +0000 Subject: [PATCH 1/3] bump CUDA to 12.4.0 --- .github/container/Dockerfile.base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 70d779640..06a970f1c 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=nvidia/cuda:12.3.0-devel-ubuntu22.04 +ARG BASE_IMAGE=nvidia/cuda:12.4.0-devel-ubuntu22.04 ARG GIT_USER_NAME="JAX Toolbox" ARG GIT_USER_EMAIL=jax@nvidia.com ARG SRC_MANIFEST_FILE=manifest.yaml From 0dc40325444815168d5d7bd653b2a98f8a3d7e3e Mon Sep 17 00:00:00 2001 From: "Yu-Hang \"Maxin\" Tang" Date: Tue, 23 Apr 2024 11:29:12 -0700 Subject: [PATCH 2/3] 12.4.1 --- .github/container/Dockerfile.base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 4c0e19581..4060eb9d3 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=nvidia/cuda:12.4.0-devel-ubuntu22.04 +ARG BASE_IMAGE=nvidia/cuda:12.4.1-devel-ubuntu22.04 ARG GIT_USER_NAME="JAX Toolbox" ARG GIT_USER_EMAIL=jax@nvidia.com ARG SRC_MANIFEST_FILE=manifest.yaml From 7b356c9f70816cb53bf7d119ac6a164c070ead45 Mon Sep 17 00:00:00 2001 From: "Yu-Hang \"Maxin\" Tang" Date: Tue, 23 Apr 2024 12:13:17 -0700 Subject: [PATCH 3/3] Experiment with IB/OFED libs from distro repo --- .github/container/Dockerfile.base | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 4060eb9d3..763010c35 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -59,6 +59,16 @@ apt_packages=( wget # llvm.sh 
lsb-release software-properties-common + # OFED for RoCE and InfiniBand support + rdma-core + libibverbs1 + libibverbs-dev + librdmacm1 + librdmacm-dev + libibumad3 + libibumad-dev + ibverbs-utils + ibverbs-providers ) if [[ $(dpkg --print-architecture) == arm64 ]]; then # h5py: The newest release of of h5py (3.11.0) does not include ARM wheels and causes pip to build h5py. @@ -155,13 +165,6 @@ RUN install-cudnn.sh ADD install-nccl.sh /usr/local/bin RUN install-nccl.sh -############################################################################### -## RoCE and InfiniteBand support -############################################################################### - -ADD install-ofed.sh /usr/local/bin -RUN install-ofed.sh - ############################################################################## ## Amazon EFA support (need to run it inside container separately) ##############################################################################