diff --git a/.ci/Dockerfile.centos8 b/.ci/Dockerfile.centos8
index fc20124ef3..cf81490019 100644
--- a/.ci/Dockerfile.centos8
+++ b/.ci/Dockerfile.centos8
@@ -1,4 +1,4 @@
-ARG CUDA_VER='11.4.2'
+ARG CUDA_VER='12.2'
 FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
 RUN rm -rf ${SRC_DIR}/ucc
@@ -6,7 +6,7 @@
 COPY . ${SRC_DIR}/ucc
 RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
     sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
-RUN yum install -y sudo && \
+RUN yum install -y sudo libevent && \
     echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
 #==============================================================================
 # Build UCC
diff --git a/.ci/Dockerfile.ngc_pytorch b/.ci/Dockerfile.ngc_pytorch
new file mode 100644
index 0000000000..b0bfc6e186
--- /dev/null
+++ b/.ci/Dockerfile.ngc_pytorch
@@ -0,0 +1,21 @@
+ARG CUDA_VER='11.4.2'
+FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
+#FROM nvcr.io/nvidia/pytorch:23.10-py3
+RUN rm -rf ${SRC_DIR}/ucc
+COPY . ${SRC_DIR}/ucc
+
+RUN apt update && apt install -y sudo && \
+    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+RUN pip install 'protobuf<=3.19.0'
+#==============================================================================
+# Build UCC
+RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
+#==============================================================================
+# Install torch_ucc (UCC version) python module and build a wheel package
+RUN chown -R 6213:11429 /opt/nvidia
+#==============================================================================
+RUN groupadd -g 11429 swx-jenkins
+RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
+#==============================================================================
+USER swx-jenkins
+
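The two files above move the CentOS 8 CI image to CUDA 12.2 (adding libevent) and introduce a new NGC-PyTorch-oriented CI Dockerfile that builds UCC via .ci/scripts/build_ucc.sh. A rough sketch of exercising the new Dockerfile locally follows; the ucc-ci:ngc_pytorch tag is purely illustrative and not defined by this patch, and the harbor.mellanox.com base image is only reachable with access to that internal registry:

    # Build from the repository root so that "COPY . ${SRC_DIR}/ucc" picks up the whole tree.
    docker build -f .ci/Dockerfile.ngc_pytorch \
        --build-arg CUDA_VER='11.4.2' \
        -t ucc-ci:ngc_pytorch .
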
diff --git a/.ci/Dockerfile.ubi8 b/.ci/Dockerfile.ubi8
new file mode 100644
index 0000000000..5ad3a224a2
--- /dev/null
+++ b/.ci/Dockerfile.ubi8
@@ -0,0 +1,23 @@
+ARG CUDA_VER='12.2'
+FROM ucc_ubi8:latest
+
+RUN rm -rf ${SRC_DIR}/ucc
+COPY . ${SRC_DIR}/ucc
+
+#RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
+#    sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
+RUN yum install -y sudo && \
+    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+#==============================================================================
+# Build UCC
+RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
+#==============================================================================
+# Install torch_ucc (UCC version) python module and build a wheel package
+RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch_ucc.sh
+RUN chown -R 6213:11429 /opt/nvidia
+#==============================================================================
+RUN groupadd -g 11429 swx-jenkins
+RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
+#==============================================================================
+USER swx-jenkins
+
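Dockerfile.ubi8 starts FROM ucc_ubi8:latest, so an image with exactly that tag must already exist wherever this is built; presumably it is produced from .ci/build_base_docker/Dockerfile.ubi8.base introduced further below, whose local.repo points at an internal webrepo mirror and is therefore expected to build only inside the lab network. A hedged sketch of the likely two-step build (only the ucc_ubi8:latest tag is mandated by the Dockerfile, the other tag is illustrative):

    # Step 1: build the ubi8 base from the repo root (it COPYs .ci/build_base_docker/local.repo).
    docker build -f .ci/build_base_docker/Dockerfile.ubi8.base \
        --build-arg CUDA_VER='12.1.1' -t ucc_ubi8:latest .
    # Step 2: build the CI image on top of it.
    docker build -f .ci/Dockerfile.ubi8 -t ucc-ci:ubi8 .
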
diff --git a/.ci/build_base_docker/Dockerfile.ngc_pytorch.base b/.ci/build_base_docker/Dockerfile.ngc_pytorch.base
new file mode 100644
index 0000000000..9f819f1aa9
--- /dev/null
+++ b/.ci/build_base_docker/Dockerfile.ngc_pytorch.base
@@ -0,0 +1,71 @@
+ARG CUDA_VER='12.1.1'
+FROM nvcr.io/nvidia/pytorch:23.11-py3
+#==============================================================================
+ARG NVIDIA_ROOT_DIR=/opt/nvidia
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src
+ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg
+ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin
+ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads
+ENV TORCH_UCC_GITHUB_URL=https://github.com/facebookresearch/torch_ucc.git
+ENV TORCH_UCC_BRANCH=main
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git
+ENV UCX_BRANCH=master
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build
+ENV OFED_PKG='lsof kmod udev swig libelf1 libfuse2 pciutils tk gfortran libpci3 libusb-1.0-0 libltdl-dev libmnl0 bison tcl flex chrpath debhelper ethtool graphviz'
+ENV PACKAGES='numactl openssh-server protobuf-compiler rdma-core vim libevent-dev build-essential git make autoconf libtool'
+ENV OS_VERSION=ubuntu22.04
+ENV PLATFORM=x86_64
+ENV MOFED_VERSION=23.10-0.5.5.0
+ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz"
+ENV OMPI_PATH="/opt/hpcx/ompi"
+#==============================================================================
+RUN apt update && apt install -y ${OFED_PKG} && \
+    mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \
+    tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \
+    /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update -q --distro ${OS_VERSION} --basic && \
+    rm -rf /tmp/ofed
+
+RUN apt install -y ${PACKAGES}
+
+# Remove old UCX
+RUN rm -rf /opt/hpcx/uc?
+ENV PATH=${OMPI_PATH}/bin:$PATH
+RUN echo "export PATH=\"\$OMPI_PATH:\$PATH\"" >> /etc/bashrc && \
+    export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib64:\${LD_LIBRARY_PATH}\" >> /etc/bashrc
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
+    ssh-keygen -A && \
+    rm -f /run/nologin
+#==============================================================================
+
+#==============================================================================
+RUN mkdir -p ${SRC_DIR} ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \
+    cd ${SRC_DIR} && \
+    mkdir -p ${SRC_DIR}/ucx && \
+    git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \
+    cd ${SRC_DIR}/ucx && \
+    git checkout ${UCX_BRANCH}
+
+COPY . ${SRC_DIR}/ucc
+#==============================================================================
+# Build UCX
+RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Install workloads
+WORKDIR ${WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+    cd ${WORKLOADS_DIR}/dlrm && \
+    pip3 install -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip3 install tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+    pip3 install -r ${WORKLOADS_DIR}/param/requirements.txt
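One line in the base image above is worth a second look: in the RUN that appends to /etc/bashrc, only the first command is an echo; the second ("export LD_LIBRARY_PATH=... >> /etc/bashrc") exports the variable in the build shell and appends nothing to the file. The PATH line also appends $OMPI_PATH rather than $OMPI_PATH/bin, unlike the ENV directive just above it. If persisting both settings is the intent, a sketch of the likely intended form (not part of this patch; the same pattern recurs in Dockerfile.ubi8.base below) would be:

    RUN echo "export PATH=\"\$OMPI_PATH/bin:\$PATH\"" >> /etc/bashrc && \
        echo "export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib64:\$LD_LIBRARY_PATH\"" >> /etc/bashrc
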
diff --git a/.ci/build_base_docker/Dockerfile.ubi8.base b/.ci/build_base_docker/Dockerfile.ubi8.base
new file mode 100644
index 0000000000..fb8373210e
--- /dev/null
+++ b/.ci/build_base_docker/Dockerfile.ubi8.base
@@ -0,0 +1,85 @@
+ARG CUDA_VER='12.1.1'
+FROM nvidia/cuda:${CUDA_VER}-devel-ubi8
+#==============================================================================
+ARG NVIDIA_ROOT_DIR=/opt/nvidia
+ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src
+ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg
+ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin
+ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads
+ENV TORCH_UCC_GITHUB_URL=https://github.com/facebookresearch/torch_ucc.git
+ENV TORCH_UCC_BRANCH=main
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git
+ENV UCX_BRANCH=master
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build
+ENV OFED_PKG 'python36 tk pciutils-libs fuse-libs kernel-modules-extra libmnl wget numactl-libs gcc-gfortran'
+ENV PACKAGES 'numactl numactl-devel openssh-server protobuf-compiler protobuf-devel python3.8 python38-devel vim openmpi openmpi-devel hostname'
+ENV OS_VERSION rhel8.0
+ENV PLATFORM x86_64
+ENV MOFED_VERSION 23.10-0.5.5.0
+ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz"
+ENV OMPI_PATH "/usr/lib64/openmpi"
+#==============================================================================
+COPY .ci/build_base_docker/local.repo /etc/yum.repos.d/local.repo
+RUN yum groupinstall -y \
+    'Development Tools' && \
+    yum install -y ${OFED_PKG} && \
+    mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \
+    tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \
+    /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update --basic -q --distro ${OS_VERSION} && \
+    rm -rf /tmp/ofed
+
+RUN yum install -y ${PACKAGES} && \
+    update-alternatives --set python3 /usr/bin/python3.8
+
+# Remove old UCX
+RUN rpm -e --nodeps ucx
+#ENV PATH=/usr/lib64/openmpi/bin:$PATH
+ENV PATH=${OMPI_PATH}/bin:$PATH
+RUN echo "export PATH=\"\$OMPI_PATH:\$PATH\"" >> /etc/bashrc && \
+    export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib:\${LD_LIBRARY_PATH}\" >> /etc/bashrc
+RUN cd /tmp && wget https://github.com/Kitware/CMake/releases/download/v3.20.4/cmake-3.20.4-linux-x86_64.sh && \
+    chmod +x /tmp/cmake-3.20.4-linux-x86_64.sh && /tmp/cmake-3.20.4-linux-x86_64.sh --skip-license --prefix=/usr && \
+    rm -f /tmp/cmake-3.20.4-linux-x86_64.sh
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
+    ssh-keygen -A && \
+    rm -f /run/nologin
+#==============================================================================
+
+#==============================================================================
+RUN mkdir -p ${SRC_DIR} ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \
+    git clone ${TORCH_UCC_GITHUB_URL} ${SRC_DIR} && \
+    cd ${SRC_DIR} && \
+    git checkout ${TORCH_UCC_BRANCH} && \
+    mkdir -p ${SRC_DIR}/ucx && \
+    git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \
+    cd ${SRC_DIR}/ucx && \
+    git checkout ${UCX_BRANCH}
+
+COPY . ${SRC_DIR}/ucc
+#==============================================================================
+# Build UCX
+RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Configure Python
+RUN ${SRC_DIR}/ucc/.ci/scripts/configure_python.sh
+#==============================================================================
+# Install PyTorch
+RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch.sh
+#==============================================================================
+# Install workloads
+WORKDIR ${WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+    cd ${WORKLOADS_DIR}/dlrm && \
+    pip3 install -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip3 install tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+    pip3 install -r ${WORKLOADS_DIR}/param/requirements.txt
diff --git a/.ci/build_base_docker/local.repo b/.ci/build_base_docker/local.repo
new file mode 100644
index 0000000000..57f55da621
--- /dev/null
+++ b/.ci/build_base_docker/local.repo
@@ -0,0 +1,16 @@
+[Local_appStream]
+baseurl=http://webrepo/RH/rh-mirrors/8-upstream/appstream/x86_64/
+enabled=1
+gpgcheck=0
+[Local_BaseOs]
+baseurl=http://webrepo/RH/rh-mirrors/8-upstream/baseos/x86_64/
+enabled=1
+gpgcheck=0
+[Local_Builder]
+baseurl=http://webrepo/RH/rh-mirrors/8-upstream/codeready-builder/x86_64/
+enabled=1
+gpgcheck=0
+[Local_High]
+baseurl=http://webrepo/RH/rh-mirrors/8-upstream/highavailability/x86_64/
+enabled=1
+gpgcheck=0
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index af23b10578..778bdec078 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -20,7 +20,7 @@ volumes:
   }

 env:
-  CUDA_VER: '11.4.2'
+  CUDA_VER: '12.1.1'
   UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
   UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
   NVIDIA_ROOT_DIR: "/opt/nvidia"
@@ -42,8 +42,8 @@ credentials:

 runs_on_dockers:
   - {
-    file: ".ci/Dockerfile.centos8",
-    name: "centos8",
+    file: ".ci/Dockerfile.ngc_pytorch",
+    name: "ngc_pytorch",
     tag: "${BUILD_NUMBER}",
     arch: "x86_64",
     uri: "${UCC_URI_SUFFIX}",
@@ -77,9 +77,10 @@ steps:
     run: |
       export UCC_PASSWORD=$UCC_PASSWORD
       export UCC_USERNAME=$UCC_USERNAME
-      echo "Running coverity"
-      ${WORKSPACE}/.ci/scripts/coverity.sh
-    archiveArtifacts: .ci/scripts/cov-build/*
+      echo "Running coverity "
+      env
+#      ${WORKSPACE}/.ci/scripts/coverity.sh
+#    archiveArtifacts: .ci/scripts/cov-build/*
   #============================================================================
   - name: Run UCC / Torch-UCC tests
@@ -90,7 +91,7 @@ steps:
       docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_ucc.sh"

       echo "INFO: Run Torch-UCC tests (UCC)"
-      docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_torch_ucc.sh"
+      # docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_torch_ucc.sh"
     always: |
       docker rm --force $(cat ${WORKSPACE}/ucc_docker.id)
   #============================================================================
diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh
index 58bb7ffdcb..bfae440068 100755
--- a/.ci/scripts/build_ucc.sh
+++ b/.ci/scripts/build_ucc.sh
@@ -7,7 +7,7 @@ cd "${UCC_SRC_DIR}"
 "${UCC_SRC_DIR}/autogen.sh"
 mkdir -p "${UCC_SRC_DIR}/build"
 cd "${UCC_SRC_DIR}/build"
-"${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \
+"${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" --with-nvcc-gencode="-gencode=arch=compute_70,code=sm_70" \
     --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi
 make -j install
 echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
diff --git a/.ci/scripts/install_torch.sh b/.ci/scripts/install_torch.sh
index f5b0fe25bd..c5bb73bb90 100755
--- a/.ci/scripts/install_torch.sh
+++ b/.ci/scripts/install_torch.sh
@@ -31,7 +31,10 @@ set -o pipefail
 #conda uninstall -y pytorch torchvision
 #conda install pytorch torchvision cudatoolkit=11.0 -c pytorch-nightly
 #conda install pytorch cudatoolkit=11.0 -c pytorch-nightly
-
+ls /usr/local/lib64/python3.8/dist-packages/torch/lib -la
+ls -la /usr/local/lib64/python3.8/
 pip3 install --default-timeout=900 numpy
-pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html
+#pip3 install torch torchvision torchaudio
+pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
+#pip3 install --default-timeout=900 --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
 pip3 install "git+https://github.com/mlperf/logging.git@0.7.1"
diff --git a/.ci/scripts/install_torch_ucc.sh b/.ci/scripts/install_torch_ucc.sh
index 841facc3cd..137181ecfc 100755
--- a/.ci/scripts/install_torch_ucc.sh
+++ b/.ci/scripts/install_torch_ucc.sh
@@ -7,6 +7,8 @@ export UCX_HOME=${UCX_INSTALL_DIR}
 export UCC_HOME=${UCC_INSTALL_DIR}
 export WITH_CUDA=${CUDA_HOME}
 cd "${SRC_DIR}"
+ls -la /usr/local/lib64/python3.8/site-packages/torch*
+#ls /usr/local/lib64/python3.8/dist-packages/torch/lib -la
 python setup.py install bdist_wheel
 pip3 list | grep torch
 python -c 'import torch, torch_ucc'
diff --git a/.ci/scripts/run_docker.sh b/.ci/scripts/run_docker.sh
index 7f141d65c9..9535298bb2 100755
--- a/.ci/scripts/run_docker.sh
+++ b/.ci/scripts/run_docker.sh
@@ -45,7 +45,7 @@ DOCKER_RUN_ARGS="\
 -d \
 --rm \
 --name=${DOCKER_CONTAINER_NAME} \
--v /labhome:/labhome \
+-v /labhome/swx-jenkins:/labhome/swx-jenkins \
 "

 # shellcheck disable=SC2013
diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh
index 4701a7c04e..6ce2da68e1 100755
--- a/.ci/scripts/run_tests_ucc_mpi.sh
+++ b/.ci/scripts/run_tests_ucc_mpi.sh
@@ -15,8 +15,8 @@ if [ -z "$HOSTFILE" ]; then
     exit 1
 fi

-export PATH="/usr/lib64/openmpi/bin:$PATH"
-export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}"
+export PATH="/opt/hpcx/ompi/bin/:$PATH"
+export LD_LIBRARY_PATH="/opt/hpcx/ompi/lib:${LD_LIBRARY_PATH}"

 HEAD_NODE=$(head -1 "$HOSTFILE")
 export HEAD_NODE
@@ -53,6 +53,11 @@ function mpi_params {
 }

 #
 # shellcheck disable=SC2086
+echo "====================ENV clx01 ------------"
+env
+whereis mpirun
+echo " ========== ENV clx02"
+ssh swx-clx02 env
 mpirun $(mpi_params 1) hostname
diff --git a/test/gtest/core/test_mc_reduce.cc b/test/gtest/core/test_mc_reduce.cc
index e528119835..674808ccdb 100644
--- a/test/gtest/core/test_mc_reduce.cc
+++ b/test/gtest/core/test_mc_reduce.cc
@@ -101,6 +101,7 @@ class test_mc_reduce : public testing::Test {
                 std::cerr << "failed to destory cuda stream" << std::endl;
                 return UCC_ERR_NO_MESSAGE;
             }
+            ee_context = NULL;
         }
 #endif
         return status;
     }
@@ -110,11 +111,11 @@
     {
         ucc_status_t status;

-        status = alloc_executor(mtype);
+        status = alloc_bufs(mtype, n);
         if (UCC_OK != status) {
             return status;
         }
-        return alloc_bufs(mtype, n);
+        return alloc_executor(mtype);
     }

     ucc_status_t alloc_bufs(ucc_memory_type_t mtype, size_t n)
@@ -192,9 +193,6 @@
     virtual void TearDown() override
     {
         free_bufs(mem_type);
-        if (executor) {
-            free_executor();
-        }
         ucc_mc_finalize();
     }

@@ -246,6 +244,9 @@
         GTEST_SKIP();
     }
     ASSERT_EQ(status, UCC_OK);
+    if (executor) {
+        free_executor();
+    }
     if (mt != UCC_MEMORY_TYPE_HOST) {
         ucc_mc_memcpy(this->res_h, this->res_d,
                       this->COUNT * sizeof(*this->res_d),
@@ -272,6 +273,9 @@
         GTEST_SKIP();
     }
     ASSERT_EQ(status, UCC_OK);
+    if (executor) {
+        free_executor();
+    }
     if (mt != UCC_MEMORY_TYPE_HOST) {
         ucc_mc_memcpy(this->res_h, this->res_d,
                       this->COUNT * sizeof(*this->res_d),
@@ -305,6 +309,9 @@
         GTEST_SKIP();
    }
     ASSERT_EQ(status, UCC_OK);
+    if (executor) {
+        free_executor();
+    }
     if (mt != UCC_MEMORY_TYPE_HOST) {
         ucc_mc_memcpy(this->res_h, this->res_d,
                       this->COUNT * sizeof(*this->res_d),
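The test_mc_reduce.cc changes allocate buffers before the executor, reset ee_context once its CUDA stream has been destroyed, and free the executor in the test body right after the reduction completes instead of in TearDown. A hedged way to re-run just these tests after rebuilding; the gtest binary path assumes the default in-tree layout produced by --enable-gtest in build_ucc.sh and may differ:

    cd "${UCC_SRC_DIR}/build" && make -j
    ./test/gtest/gtest --gtest_filter='*mc_reduce*'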