Skip to content

Commit

Permalink
Update CUDA & MOFED ver
Browse files Browse the repository at this point in the history
  • Loading branch information
B-a-S authored and root committed Dec 13, 2023
1 parent 885bf53 commit 2e038c3
Show file tree
Hide file tree
Showing 12 changed files with 255 additions and 11 deletions.
4 changes: 2 additions & 2 deletions .ci/Dockerfile.centos8
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
ARG CUDA_VER='11.4.2'
ARG CUDA_VER='12.2'
FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base

RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y sudo && \
RUN yum install -y sudo libevent && \
echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
#==============================================================================
# Build UCC
Expand Down
23 changes: 23 additions & 0 deletions .ci/Dockerfile.pytorch
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# CI image: copies the UCC source tree into a prebuilt base image, builds UCC,
# and finishes as the unprivileged swx-jenkins user.
# NOTE(review): the active base below is the centos8 image, while the package
# step uses apt-get and the commented-out alternative is an NGC PyTorch image.
# Confirm which base is intended; apt-get is normally absent on CentOS images.
ARG CUDA_VER='11.4.2'
FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
#FROM nvcr.io/nvidia/pytorch:23.10-py3

# Replace any UCC tree shipped with the base image by the build context.
RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

#RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
# sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
# apt-get (stable CLI for scripts) instead of apt; the package lists are
# removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends sudo && \
    rm -rf /var/lib/apt/lists/* && \
    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
#==============================================================================
# Build UCC
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
#==============================================================================
# Install torch_ucc (UCC version) python module and build a wheel package
#RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch_ucc.sh
# Numeric ids are used because the swx-jenkins account is only created below.
RUN chown -R 6213:11429 /opt/nvidia
#==============================================================================
# Create the CI group and user with the same ids as the chown above.
RUN groupadd -g 11429 swx-jenkins && \
    adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
#==============================================================================
USER swx-jenkins

23 changes: 23 additions & 0 deletions .ci/Dockerfile.ubi8
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# CI image: copies the UCC source tree into the ucc_ubi8 base image, builds
# UCC and torch_ucc, and finishes as the unprivileged swx-jenkins user.
# NOTE(review): CUDA_VER is declared but not referenced by the FROM line, and
# the base is taken by the floating "latest" tag — confirm both are intended.
ARG CUDA_VER='12.2'
FROM ucc_ubi8:latest

# Replace any UCC tree shipped with the base image by the build context.
RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

#RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
# sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
# Clean the yum cache in the same layer that populated it.
RUN yum install -y sudo && \
    yum clean all && \
    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
#==============================================================================
# Build UCC
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
#==============================================================================
# Install torch_ucc (UCC version) python module and build a wheel package
RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch_ucc.sh
# Numeric ids are used because the swx-jenkins account is only created below.
RUN chown -R 6213:11429 /opt/nvidia
#==============================================================================
# Create the CI group and user with the same ids as the chown above.
RUN groupadd -g 11429 swx-jenkins && \
    adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
#==============================================================================
USER swx-jenkins

84 changes: 84 additions & 0 deletions .ci/build_base_docker/Dockerfile.ngs.base
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Base image for UCC CI, layered on the nvcr.io/nvidia/pytorch image.
# NOTE(review): CUDA_VER is declared but not referenced in this file's FROM.
ARG CUDA_VER='12.1.1'
FROM nvcr.io/nvidia/pytorch:23.11-py3
#==============================================================================
# Root directory under which sources, packages, binaries and workloads live.
ARG NVIDIA_ROOT_DIR=/opt/nvidia
# Package-manager front end and timezone settings.
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Etc/UTC
# Directory layout derived from NVIDIA_ROOT_DIR.
ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src \
    PKG_DIR=${NVIDIA_ROOT_DIR}/pkg \
    BIN_DIR=${NVIDIA_ROOT_DIR}/bin \
    WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads
# Upstream repositories/branches and the CUDA location.
ENV TORCH_UCC_GITHUB_URL=https://github.com/facebookresearch/torch_ucc.git \
    TORCH_UCC_BRANCH=main \
    CUDA_HOME=/usr/local/cuda \
    UCX_GITHUB_URL=https://github.com/openucx/ucx.git \
    UCX_BRANCH=master \
    UCX_BUILD_TYPE=release-mt
# Install prefixes; a separate instruction because they reference BIN_DIR and
# UCX_BUILD_TYPE, which must already be defined by an earlier instruction.
ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE} \
    UCC_INSTALL_DIR=${BIN_DIR}/ucc/build
# Package lists consumed by the install steps later in this file.
ENV OFED_PKG='lsof kmod udev swig libelf1 libfuse2 pciutils tk gfortran libpci3 libusb-1.0-0 libltdl-dev libmnl0 bison tcl flex chrpath debhelper ethtool graphviz'
#'tk pciutils-libs fuse-libs kernel-modules-extra libmnl wget numactl-libs gcc-gfortran'
ENV PACKAGES='numactl openssh-server protobuf-compiler rdma-core vim libevent-dev build-essential git make autoconf libtool'
# Components of the MLNX_OFED download URL.
ENV OS_VERSION=ubuntu22.04 \
    PLATFORM=x86_64 \
    MOFED_VERSION=23.10-0.5.5.0
# Separate instruction: the URL is assembled from the three values above.
ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz"
ENV OMPI_PATH="/usr/mpi/gcc/openmpi-4.1.7a1"
#==============================================================================
#COPY .ci/build_base_docker/local.repo /etc/yum.repos.d/local.repo
# Install the MLNX_OFED prerequisites, then download, unpack and run the
# MLNX_OFED installer (user-space only, no firmware update). The unpacked
# installer and the apt package lists are removed in the same layer.
RUN apt-get update && \
    apt-get install -y ${OFED_PKG} && \
    mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \
    tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \
    /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update -q --distro ${OS_VERSION} --basic && \
    rm -rf /tmp/ofed /var/lib/apt/lists/*

# Refresh the package lists in the same layer as the install, so a cached
# earlier layer can never supply stale lists.
RUN apt-get update && \
    apt-get install -y ${PACKAGES} && \
    rm -rf /var/lib/apt/lists/*
# update-alternatives --set python3 /usr/bin/python3.8

# Remove old UCX
#RUN rpm -e --nodeps ucx
##ENV PATH=/usr/lib64/openmpi/bin:$PATH
# Put the Open MPI binaries under OMPI_PATH first on PATH for later build
# steps and for containers started from this image.
ENV PATH=${OMPI_PATH}/bin:$PATH
# Persist the MPI settings in /etc/bashrc. Both lines must be written with
# echo: a bare `export ... >> file` only changes the build shell's environment
# and appends nothing. The PATH entry uses the bin directory, matching the ENV
# line above. The variables are written unexpanded and resolved at shell start.
# NOTE(review): this image sets OS_VERSION=ubuntu22.04, where bash reads
# /etc/bash.bashrc rather than /etc/bashrc — confirm something sources this file.
RUN echo "export PATH=\"\$OMPI_PATH/bin:\$PATH\"" >> /etc/bashrc && \
    echo "export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib64:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
#RUN cd /tmp && wget https://github.com/Kitware/CMake/releases/download/v3.20.4/cmake-3.20.4-linux-x86_64.sh && \
# chmod +x /tmp/cmake-3.20.4-linux-x86_64.sh && /tmp/cmake-3.20.4-linux-x86_64.sh --skip-license --prefix=/usr && \
# rm -f /tmp/cmake-3.20.4-linux-x86_64.sh
#==============================================================================
# Configure SSH: prepare the sshd runtime directory and host keys, allow
# logins by removing /run/nologin, and rewrite the client configuration so
# that its only StrictHostKeyChecking setting is "no".
RUN mkdir -p /var/run/sshd && \
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
    ssh-keygen -A && \
    rm -f /run/nologin
#==============================================================================

#==============================================================================
# Create the directory layout, then fetch UCX (with submodules) and switch the
# checkout to the requested branch.
RUN mkdir -p ${SRC_DIR}/ucx ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \
    git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \
    git -C ${SRC_DIR}/ucx checkout ${UCX_BRANCH}

# The build context provides the UCC tree, including the CI scripts used below.
COPY . ${SRC_DIR}/ucc
#==============================================================================
# Build UCX
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh
# Expose the UCX tools from the install prefix.
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
#==============================================================================
# Configure Python
#RUN ${SRC_DIR}/ucc/.ci/scripts/configure_python.sh
#==============================================================================
# Install PyTorch
#RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch.sh
#==============================================================================
# Install workloads: clone the dlrm and param repositories into WORKLOADS_DIR
# and install their Python requirements. --no-cache-dir keeps pip's download
# cache out of the image layers.
WORKDIR ${WORKLOADS_DIR}
RUN git clone https://github.com/facebookresearch/dlrm.git && \
    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \
    pip3 install --no-cache-dir tensorboard
RUN git clone https://github.com/facebookresearch/param.git && \
    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/param/requirements.txt
85 changes: 85 additions & 0 deletions .ci/build_base_docker/Dockerfile.ubi8.base
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Base image for UCC CI, layered on the CUDA development image for UBI 8.
# CUDA_VER selects the tag of the nvidia/cuda base image.
ARG CUDA_VER='12.1.1'
FROM nvidia/cuda:${CUDA_VER}-devel-ubi8
#==============================================================================
# Root directory under which sources, packages, binaries and workloads live.
ARG NVIDIA_ROOT_DIR=/opt/nvidia
ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src
ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg
ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin
ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads
ENV TORCH_UCC_GITHUB_URL=https://github.com/facebookresearch/torch_ucc.git
ENV TORCH_UCC_BRANCH=main
ENV CUDA_HOME=/usr/local/cuda
ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git
ENV UCX_BRANCH=master
ENV UCX_BUILD_TYPE=release-mt
ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build
# key=value form throughout; the space-separated `ENV key value` form is
# deprecated by Docker.
# Package lists consumed by the install steps later in this file.
ENV OFED_PKG='python36 tk pciutils-libs fuse-libs kernel-modules-extra libmnl wget numactl-libs gcc-gfortran'
ENV PACKAGES='numactl numactl-devel openssh-server protobuf-compiler protobuf-devel python3.8 python38-devel vim openmpi openmpi-devel hostname'
# Components of the MLNX_OFED download URL assembled in MOFED_URL.
ENV OS_VERSION=rhel8.0
ENV PLATFORM=x86_64
ENV MOFED_VERSION=23.10-0.5.5.0
ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz"
ENV OMPI_PATH="/usr/lib64/openmpi"
#==============================================================================
# Register the additional yum repositories defined in local.repo.
COPY .ci/build_base_docker/local.repo /etc/yum.repos.d/local.repo
# Install the build tool group and the MLNX_OFED prerequisites, then download,
# unpack and run the MLNX_OFED installer (user-space only, no firmware
# update). The unpacked installer and the yum cache are removed in the same
# layer.
RUN yum groupinstall -y 'Development Tools' && \
    yum install -y ${OFED_PKG} && \
    mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \
    tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \
    /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update --basic -q --distro ${OS_VERSION} && \
    rm -rf /tmp/ofed && \
    yum clean all

# Install the remaining packages and make python3 resolve to Python 3.8.
RUN yum install -y ${PACKAGES} && \
    yum clean all && \
    update-alternatives --set python3 /usr/bin/python3.8

# Remove old UCX (the ucx RPM is erased without dependency checks; UCX is
# rebuilt from source later in this file).
RUN rpm -e --nodeps ucx
#ENV PATH=/usr/lib64/openmpi/bin:$PATH
# Put the Open MPI binaries under OMPI_PATH first on PATH for later build
# steps and for containers started from this image.
ENV PATH=${OMPI_PATH}/bin:$PATH
# Persist the MPI settings in /etc/bashrc. Both lines must be written with
# echo: a bare `export ... >> file` only changes the build shell's environment
# and appends nothing. The PATH entry uses the bin directory, matching the ENV
# line above. The variables are written unexpanded and resolved at shell start.
RUN echo "export PATH=\"\$OMPI_PATH/bin:\$PATH\"" >> /etc/bashrc && \
    echo "export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
# Install CMake 3.20.4 into /usr from the upstream self-extracting installer;
# the installer is deleted in the same layer.
# NOTE(review): the download is not checksum-verified.
RUN wget -O /tmp/cmake-3.20.4-linux-x86_64.sh https://github.com/Kitware/CMake/releases/download/v3.20.4/cmake-3.20.4-linux-x86_64.sh && \
    chmod +x /tmp/cmake-3.20.4-linux-x86_64.sh && /tmp/cmake-3.20.4-linux-x86_64.sh --skip-license --prefix=/usr && \
    rm -f /tmp/cmake-3.20.4-linux-x86_64.sh
#==============================================================================
# Configure SSH: prepare the sshd runtime directory and host keys, allow
# logins by removing /run/nologin, and rewrite the client configuration so
# that its only StrictHostKeyChecking setting is "no".
RUN mkdir -p /var/run/sshd && \
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
    ssh-keygen -A && \
    rm -f /run/nologin
#==============================================================================

#==============================================================================
# Create the directory layout, clone torch_ucc directly into SRC_DIR and UCX
# (with submodules) into SRC_DIR/ucx, each switched to its requested branch.
# The ucx directory is created only after the torch_ucc clone, because that
# clone needs SRC_DIR to be empty.
RUN mkdir -p ${SRC_DIR} ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \
    git clone ${TORCH_UCC_GITHUB_URL} ${SRC_DIR} && \
    git -C ${SRC_DIR} checkout ${TORCH_UCC_BRANCH} && \
    mkdir -p ${SRC_DIR}/ucx && \
    git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \
    git -C ${SRC_DIR}/ucx checkout ${UCX_BRANCH}

# The build context provides the UCC tree, including the CI scripts used below.
COPY . ${SRC_DIR}/ucc
#==============================================================================
# Build UCX
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh
# Expose the UCX tools from the install prefix.
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
#==============================================================================
# Configure Python
RUN ${SRC_DIR}/ucc/.ci/scripts/configure_python.sh
#==============================================================================
# Install PyTorch
RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch.sh
#==============================================================================
# Install workloads: clone the dlrm and param repositories into WORKLOADS_DIR
# and install their Python requirements. --no-cache-dir keeps pip's download
# cache out of the image layers.
WORKDIR ${WORKLOADS_DIR}
RUN git clone https://github.com/facebookresearch/dlrm.git && \
    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \
    pip3 install --no-cache-dir tensorboard
RUN git clone https://github.com/facebookresearch/param.git && \
    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/param/requirements.txt
16 changes: 16 additions & 0 deletions .ci/build_base_docker/local.repo
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Additional yum/dnf repositories for the UBI 8 base image build.
# NOTE(review): "webrepo" is an unqualified host name, presumably an internal
# mirror reachable only from the build network — confirm.
# gpgcheck=0 disables package signature verification for these repositories.
# Each section carries an explicit name= so dnf does not warn about a missing
# name and fall back to the section id.
[Local_appStream]
name=Local AppStream
baseurl=http://webrepo/RH/rh-mirrors/8-upstream/appstream/x86_64/
enabled=1
gpgcheck=0
[Local_BaseOs]
name=Local BaseOS
baseurl=http://webrepo/RH/rh-mirrors/8-upstream/baseos/x86_64/
enabled=1
gpgcheck=0
[Local_Builder]
name=Local CodeReady Builder
baseurl=http://webrepo/RH/rh-mirrors/8-upstream/codeready-builder/x86_64/
enabled=1
gpgcheck=0
[Local_High]
name=Local High Availability
baseurl=http://webrepo/RH/rh-mirrors/8-upstream/highavailability/x86_64/
enabled=1
gpgcheck=0
2 changes: 1 addition & 1 deletion .ci/job_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ volumes:
}

env:
CUDA_VER: '11.4.2'
CUDA_VER: '12.1.1'
UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
NVIDIA_ROOT_DIR: "/opt/nvidia"
Expand Down
1 change: 1 addition & 0 deletions .ci/scripts/.#install_torch_ucc.sh
2 changes: 1 addition & 1 deletion .ci/scripts/build_ucc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ cd "${UCC_SRC_DIR}"
"${UCC_SRC_DIR}/autogen.sh"
mkdir -p "${UCC_SRC_DIR}/build"
cd "${UCC_SRC_DIR}/build"
"${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \
"${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" --with-nvcc-gencode="-gencode=arch=compute_70,code=sm_70" \
--prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi
make -j install
echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
Expand Down
7 changes: 5 additions & 2 deletions .ci/scripts/install_torch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@ set -o pipefail
#conda uninstall -y pytorch torchvision
#conda install pytorch torchvision cudatoolkit=11.0 -c pytorch-nightly
#conda install pytorch cudatoolkit=11.0 -c pytorch-nightly

ls /usr/local/lib64/python3.8/dist-packages/torch/lib -la
ls -la /usr/local/lib64/python3.8/
pip3 install --default-timeout=900 numpy
pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html
#pip3 install torch torchvision torchaudio
pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
#pip3 install --default-timeout=900 --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
pip3 install "git+https://github.com/mlperf/logging.git@0.7.1"
2 changes: 2 additions & 0 deletions .ci/scripts/install_torch_ucc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ export UCX_HOME=${UCX_INSTALL_DIR}
export UCC_HOME=${UCC_INSTALL_DIR}
export WITH_CUDA=${CUDA_HOME}
cd "${SRC_DIR}"
ls -la /usr/local/lib64/python3.8/site-packages/torch*
#ls /usr/local/lib64/python3.8/dist-packages/torch/lib -la
python setup.py install bdist_wheel
pip3 list | grep torch
python -c 'import torch, torch_ucc'
17 changes: 12 additions & 5 deletions test/gtest/core/test_mc_reduce.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ class test_mc_reduce : public testing::Test {
std::cerr << "failed to destory cuda stream" << std::endl;
return UCC_ERR_NO_MESSAGE;
}
ee_context = NULL;
}
#endif
return status;
Expand All @@ -110,11 +111,11 @@ class test_mc_reduce : public testing::Test {
{
ucc_status_t status;

status = alloc_executor(mtype);
status = alloc_bufs(mtype, n);
if (UCC_OK != status) {
return status;
}
return alloc_bufs(mtype, n);
return alloc_executor(mtype);
}

ucc_status_t alloc_bufs(ucc_memory_type_t mtype, size_t n)
Expand Down Expand Up @@ -192,9 +193,6 @@ class test_mc_reduce : public testing::Test {
virtual void TearDown() override
{
free_bufs(mem_type);
if (executor) {
free_executor();
}
ucc_mc_finalize();
}

Expand Down Expand Up @@ -246,6 +244,9 @@ class test_mc_reduce : public testing::Test {
GTEST_SKIP();
}
ASSERT_EQ(status, UCC_OK);
if (executor) {
free_executor();
}

if (mt != UCC_MEMORY_TYPE_HOST) {
ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d),
Expand All @@ -272,6 +273,9 @@ class test_mc_reduce : public testing::Test {
GTEST_SKIP();
}
ASSERT_EQ(status, UCC_OK);
if (executor) {
free_executor();
}

if (mt != UCC_MEMORY_TYPE_HOST) {
ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d),
Expand Down Expand Up @@ -305,6 +309,9 @@ class test_mc_reduce : public testing::Test {
GTEST_SKIP();
}
ASSERT_EQ(status, UCC_OK);
if (executor) {
free_executor();
}

if (mt != UCC_MEMORY_TYPE_HOST) {
ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d),
Expand Down

0 comments on commit 2e038c3

Please sign in to comment.