From 557978e74a3bc97c49b87f8642ebdb54267ad8ae Mon Sep 17 00:00:00 2001
From: Andrii Bilokur
Date: Mon, 15 Jan 2024 19:27:10 -0500
Subject: [PATCH] CI: Update CUDA, MOFED & CI images (#894)

---
Review notes (text between "---" and the first "diff --git" is not part of
the commit message and is ignored when the patch is applied):

 * Dockerfile.ngc_pytorch.base: the LD_LIBRARY_PATH line had no `echo`, so it
   only exported the variable in the build shell and appended nothing. Both
   lines are now written with printf; the PATH entry points at
   ${OMPI_PATH}/bin (was ${OMPI_PATH}); OMPI_PATH is expanded at build time;
   the target is /etc/bash.bashrc because the base image is Ubuntu.
   NOTE(review): lib64 was changed to lib to agree with .ci/scripts/env.sh -
   confirm against the image.
 * Both Dockerfiles: apt -> apt-get, every install is preceded by an update
   in the same layer, and /var/lib/apt/lists is removed in that layer.
 * pip installs use --no-cache-dir.
 * Dockerfile.ngc_pytorch: the comment above `chown` described a torch_ucc
   wheel build that is not there; replaced with what the step does.
 * env.sh: LD_LIBRARY_PATH no longer gets an empty trailing element when the
   variable is unset.
 * NOTE(review): leading whitespace of the context lines in the hunks for
   job_matrix.yaml, run_docker.sh and run_tests_ucc_mpi.sh was reconstructed;
   verify against the tree or apply with `git apply --ignore-whitespace`.
   Blob IDs on the index lines are those of the original commit.

 .ci/Dockerfile.ngc_pytorch                    | 24 ++++++
 .../Dockerfile.ngc_pytorch.base               | 75 +++++++++++++++++++
 .ci/job_matrix.yaml                           | 11 +--
 .ci/scripts/env.sh                            |  4 ++
 .ci/scripts/run_docker.sh                     |  2 +-
 .ci/scripts/run_tests_ucc_mpi.sh              |  3 -
 6 files changed, 107 insertions(+), 12 deletions(-)
 create mode 100644 .ci/Dockerfile.ngc_pytorch
 create mode 100644 .ci/build_base_docker/Dockerfile.ngc_pytorch.base

diff --git a/.ci/Dockerfile.ngc_pytorch b/.ci/Dockerfile.ngc_pytorch
new file mode 100644
index 0000000000..91111aa488
--- /dev/null
+++ b/.ci/Dockerfile.ngc_pytorch
@@ -0,0 +1,24 @@
+ARG CUDA_VER='12.1.1'
+FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
+# Replace the UCC sources copied into the base image with the tree under test.
+RUN rm -rf ${SRC_DIR}/ucc
+COPY . ${SRC_DIR}/ucc
+
+# Give the CI user passwordless sudo. update + install share one layer so the
+# package index is never stale, and the index is removed in that same layer.
+RUN apt-get update && apt-get install -y --no-install-recommends sudo && \
+    rm -rf /var/lib/apt/lists/* && \
+    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+RUN pip install --no-cache-dir 'protobuf<=3.19.0'
+#==============================================================================
+# Build UCC
+RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
+#==============================================================================
+# Hand /opt/nvidia to the CI user (numeric IDs, so the account need not exist yet)
+RUN chown -R 6213:11429 /opt/nvidia
+#==============================================================================
+# Create the swx-jenkins account with the UID/GID used by the chown above
+RUN groupadd -g 11429 swx-jenkins
+RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
+#==============================================================================
+USER swx-jenkins
diff --git a/.ci/build_base_docker/Dockerfile.ngc_pytorch.base b/.ci/build_base_docker/Dockerfile.ngc_pytorch.base
new file mode 100644
index 0000000000..891e6bc833
--- /dev/null
+++ b/.ci/build_base_docker/Dockerfile.ngc_pytorch.base
@@ -0,0 +1,75 @@
+# NOTE: CUDA_VER is declared but is not referenced by the FROM line below.
+ARG CUDA_VER='12.1.1'
+FROM nvcr.io/nvidia/pytorch:23.11-py3
+#==============================================================================
+# Directory layout and tool versions. ENV values are inherited by images built
+# FROM this one (.ci/Dockerfile.ngc_pytorch uses SRC_DIR).
+ARG NVIDIA_ROOT_DIR=/opt/nvidia
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src
+ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg
+ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin
+ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git
+ENV UCX_BRANCH=master
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build
+ENV OFED_PKG='lsof kmod udev swig libelf1 libfuse2 pciutils tk gfortran libpci3 libusb-1.0-0 libltdl-dev libmnl0 bison tcl flex chrpath debhelper ethtool graphviz'
+ENV PACKAGES='numactl openssh-server protobuf-compiler rdma-core vim libevent-dev build-essential git make autoconf libtool'
+ENV OS_VERSION=ubuntu22.04
+ENV PLATFORM=x86_64
+ENV MOFED_VERSION=23.10-0.5.5.0
+ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz"
+ENV OMPI_PATH="/opt/hpcx/ompi"
+#==============================================================================
+# Install the MOFED user-space packages. The tarball and the apt index are
+# removed in this same layer so they do not persist in the image.
+RUN apt-get update && apt-get install -y ${OFED_PKG} && \
+    mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \
+    tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \
+    /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update -q --distro ${OS_VERSION} --basic && \
+    rm -rf /tmp/ofed /var/lib/apt/lists/*
+
+# update + install share one layer so the package index is never stale.
+RUN apt-get update && apt-get install -y ${PACKAGES} && \
+    rm -rf /var/lib/apt/lists/*
+
+# Remove old UCX (the uc? glob would also match /opt/hpcx/ucc)
+RUN rm -rf /opt/hpcx/uc?
+ENV PATH=${OMPI_PATH}/bin:$PATH
+# Also record the Open MPI paths in the system-wide bashrc. OMPI_PATH is
+# expanded at build time; $PATH and $LD_LIBRARY_PATH are written literally.
+# lib/ matches .ci/scripts/env.sh - NOTE(review): confirm against the image.
+RUN printf 'export PATH="%s/bin:$PATH"\n' "${OMPI_PATH}" >> /etc/bash.bashrc && \
+    printf 'export LD_LIBRARY_PATH="%s/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"\n' "${OMPI_PATH}" >> /etc/bash.bashrc
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
+    ssh-keygen -A && \
+    rm -f /run/nologin
+#==============================================================================
+# Fetch the UCX sources
+RUN mkdir -p ${SRC_DIR} ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \
+    git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \
+    git -C ${SRC_DIR}/ucx checkout ${UCX_BRANCH}
+
+COPY . ${SRC_DIR}/ucc
+#==============================================================================
+# Build UCX
+RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Install workloads
+WORKDIR ${WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+    cd ${WORKLOADS_DIR}/dlrm && \
+    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip3 install --no-cache-dir tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/param/requirements.txt
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index af23b10578..da1f4f65c5 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -20,7 +20,7 @@ volumes:
   }
 
 env:
-  CUDA_VER: '11.4.2'
+  CUDA_VER: '12.1.1'
   UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
   UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
   NVIDIA_ROOT_DIR: "/opt/nvidia"
@@ -42,8 +42,8 @@ credentials:
 
 runs_on_dockers:
   - {
-      file: ".ci/Dockerfile.centos8",
-      name: "centos8",
+      file: ".ci/Dockerfile.ngc_pytorch",
+      name: "ngc_pytorch",
       tag: "${BUILD_NUMBER}",
       arch: "x86_64",
       uri: "${UCC_URI_SUFFIX}",
@@ -69,7 +69,6 @@ steps:
       docker pull ${DOCKER_IMAGE_NAME}
       docker create -ti --rm $DOCKER_OPT ${DOCKER_IMAGE_NAME} /bin/bash > ${WORKSPACE}/ucc_docker.id
       docker start $(cat ${WORKSPACE}/ucc_docker.id)
-
   #============================================================================
   - name: Run Coverity
     credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0"
@@ -80,7 +79,6 @@ steps:
       echo "Running coverity"
       ${WORKSPACE}/.ci/scripts/coverity.sh
     archiveArtifacts: .ci/scripts/cov-build/*
-
   #============================================================================
   - name: Run UCC / Torch-UCC tests
     agentSelector: "{nodeLabel: 'swx-clx01'}"
@@ -88,9 +86,6 @@ steps:
       echo "INFO: Run UCC tests"
       hostname
       docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_ucc.sh"
-
-      echo "INFO: Run Torch-UCC tests (UCC)"
-      docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_torch_ucc.sh"
     always: |
       docker rm --force $(cat ${WORKSPACE}/ucc_docker.id)
   #============================================================================
diff --git a/.ci/scripts/env.sh b/.ci/scripts/env.sh
index 649acaa53d..b5fc5da29a 100755
--- a/.ci/scripts/env.sh
+++ b/.ci/scripts/env.sh
@@ -1,5 +1,9 @@
 #!/bin/bash -eEx
 
+export PATH="/opt/hpcx/ompi/bin:$PATH"
+export LD_LIBRARY_PATH="/opt/hpcx/ompi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+export OPAL_PREFIX=/opt/hpcx/ompi
+
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 
 # shellcheck disable=SC2034
diff --git a/.ci/scripts/run_docker.sh b/.ci/scripts/run_docker.sh
index 7f141d65c9..9535298bb2 100755
--- a/.ci/scripts/run_docker.sh
+++ b/.ci/scripts/run_docker.sh
@@ -45,7 +45,7 @@ DOCKER_RUN_ARGS="\
 -d \
 --rm \
 --name=${DOCKER_CONTAINER_NAME} \
--v /labhome:/labhome \
+-v /labhome/swx-jenkins:/labhome/swx-jenkins \
 "
 
 # shellcheck disable=SC2013
diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh
index 4701a7c04e..73a4eaca6a 100755
--- a/.ci/scripts/run_tests_ucc_mpi.sh
+++ b/.ci/scripts/run_tests_ucc_mpi.sh
@@ -15,9 +15,6 @@ if [ -z "$HOSTFILE" ]; then
     exit 1
 fi
 
-export PATH="/usr/lib64/openmpi/bin:$PATH"
-export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}"
-
 HEAD_NODE=$(head -1 "$HOSTFILE")
 export HEAD_NODE
 export MASTER_ADDR=${HEAD_NODE}