From 727000cefcc6eab8a153b3a33c63c2ed055b9b3c Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky <1682574+reidpr@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:14:30 -0700 Subject: [PATCH 01/16] PR #1845: rm pkg-config dependency at autogen.sh time --- configure.ac | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/configure.ac b/configure.ac index 34625b00d..3ca364e80 100644 --- a/configure.ac +++ b/configure.ac @@ -63,12 +63,6 @@ MKDIR_P=${MKDIR_P:-install -d -m 0755} AM_INIT_AUTOMAKE([1.13 -Wall -Werror foreign subdir-objects]) -# Check for “pkg-config”. It’s here because we use PKG_CHECK_MODULES -# conditionally later and we want to make sure this always happens [1, §3.4]. -# -# [1]: https://autotools.info/pkgconfig/pkg_check_modules.html -PKG_PROG_PKG_CONFIG - AC_CONFIG_HEADERS([bin/config.h]) AC_CONFIG_FILES([Makefile bin/Makefile @@ -491,17 +485,22 @@ have_libfuse3=n/a have_libsquashfuse_ll=n/a have_ll_h=n/a AS_IF([test $want_libsquashfuse = yes], [ - # libfuse3. Must use pkg-config because as of version 0.5.0 SquashFUSE’s - # ll.h won’t build without an appropriate -I [1]. This macro defines some - # variables that we use here; see this third-party documentation [2]. (I - # could not find first-party docs for it.) + # libfuse3. As of version 0.5.0, SquashFUSE’s ll.h won’t build without an + # appropriate -I [1]. Presently we use pkg-config to find it, but see #1844. + # + # We avoid PKG_CHECK_MODULES because it introduces a dependency on + # pkg-config at autogen.sh time, with impressively incomprehensible error + # messages if it’s not met [2]. The approach below also seems simpler [3]? 
# # [1]: https://github.com/vasi/squashfuse/commit/eca5764 - # [2]: https://autotools.info/pkgconfig/pkg_check_modules.html - PKG_CHECK_MODULES([fuse3], [fuse3], [ - # libfuse3 found + # [2]: https://ae1020.github.io/undefined-macro-pkg-config/ + # [3]: https://tirania.org/blog/archive/2012/Oct-20.html + AC_CHECK_PROG(have_pkg_config, pkg-config, yes, no) + AS_IF([test $have_pkg_config != yes], + [AC_MSG_ERROR([need pkg-config to find libfuse3; try --with-libsquashfuse=no or see issue @%:@1844])]) + AS_IF([pkg-config --exists fuse3], [ have_libfuse3=yes - CFLAGS="$CFLAGS $fuse3_CFLAGS" + CFLAGS="$CFLAGS $(pkg-config --cflags fuse3)" # libsquashfuse? AC_CHECK_LIB([squashfuse_ll], [sqfs_ll_mount], [have_libsquashfuse_ll=yes], From 4471fba6a422f0f595da1bf211b4f72b12707b2f Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 29 Feb 2024 11:47:51 -0700 Subject: [PATCH 02/16] version 0.37 --- VERSION | 2 +- doc/_loc.rst | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/VERSION b/VERSION index 2dd5e4a04..c128d4d9e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.37~pre +0.37 diff --git a/doc/_loc.rst b/doc/_loc.rst index 0e6172844..e4afa0768 100644 --- a/doc/_loc.rst +++ b/doc/_loc.rst @@ -1,24 +1,24 @@ .. Do not edit this file — it’s auto-generated. We pride ourselves on keeping Charliecloud lightweight and simple. The lines -of code as of version 0.36 is: +of code as of version 0.37 is: .. list-table:: * - Program itself - - 8924 + - 9079 * - Test suite & examples - - 11941 + - 12019 * - Documentation - - 6311 + - 6416 * - Build system - - 1289 + - 1294 * - Packaging - - 628 + - 629 * - Miscellaneous - 506 * - Total - - 29599 + - 29943 These include code only, excluding blank lines and comments. They were counted using `cloc `_ version 1.96. 
From 9871c3022c1d42b4956c40b9c6647aa5ff8b718f Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 29 Feb 2024 11:48:24 -0700 Subject: [PATCH 03/16] onward to 0.38 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index c128d4d9e..d3a551e7c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.37 +0.38~pre From c2bcd802a43f9618ff14db4a0567a3ec9e3df558 Mon Sep 17 00:00:00 2001 From: Vladimir Fokow <57260995+VladimirFokow@users.noreply.github.com> Date: Wed, 6 Mar 2024 00:18:54 +0100 Subject: [PATCH 04/16] PR #1853: tutorial: fix missing hello.py in output --- doc/tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index a75d088b7..78343f656 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -426,8 +426,8 @@ Let’s pull that image and see how it looks:: hello.2 $ ch-convert hello.2 ./hello.2 $ ls ./hello.2 - bin etc lib mnt proc run srv tmp var - dev home media opt root sbin sys usr + bin ch dev etc hello.py home lib lib64 media mnt + opt proc root run sbin srv sys tmp usr var MPI Hello World From 5c1a2ba4c707ca1709a789f87c4c6679ef992585 Mon Sep 17 00:00:00 2001 From: Peter Wienemann Date: Mon, 11 Mar 2024 18:36:25 +0100 Subject: [PATCH 05/16] PR #1856: doctest-auto: Ensure locale-independent output --- test/doctest-auto | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/doctest-auto b/test/doctest-auto index 9b736591c..e860adeda 100755 --- a/test/doctest-auto +++ b/test/doctest-auto @@ -4,6 +4,9 @@ set -e -o pipefail +# Ensure reproducible output (#1849) +export LC_ALL=C + cat < Date: Tue, 19 Mar 2024 09:31:11 -0600 Subject: [PATCH 06/16] PR #1863: --break: nice error if stdin is not a TTY --- bin/ch-image.py.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/ch-image.py.in b/bin/ch-image.py.in index 166cb21ef..4871c90b9 100644 --- a/bin/ch-image.py.in +++ b/bin/ch-image.py.in @@ -415,6 +415,8 @@ if (__name__ == "__main__"): for (opt, 
arg) in zip(sys.argv[1:], sys.argv[2:] + [None]): (opt, _, arg_eq) = opt.partition("=") if (opt == "--break"): + if (not sys.stdin.isatty()): + ch.FATAL("--break: standard input must be a terminal") if (arg_eq != ""): arg = arg_eq try: From 5ceefa7040c0accb91acf8b2c99e0f7ec6a97429 Mon Sep 17 00:00:00 2001 From: Lucas Caudill Date: Wed, 20 Mar 2024 16:20:59 -0600 Subject: [PATCH 07/16] PR #1864: retire storage versions 4 and 5 --- lib/filesystem.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/lib/filesystem.py b/lib/filesystem.py index f6336d460..21c26aa79 100644 --- a/lib/filesystem.py +++ b/lib/filesystem.py @@ -1175,7 +1175,7 @@ def init(self): if (v_found == STORAGE_VERSION): ch.VERBOSE("found storage dir v%d: %s" % (STORAGE_VERSION, self.root)) self.lock() - elif (v_found in {None, 4, 5, 6}): # initialize/upgrade + elif (v_found in {None, 6}): # initialize/upgrade ch.INFO("%s storage directory: v%d %s" % (op, STORAGE_VERSION, self.root)) self.root.mkdir() @@ -1189,23 +1189,6 @@ def init(self): self.unpack_base.mkdir() self.upload_cache.mkdir() if (v_found is not None): # upgrade - if (v_found < 6): - # Git metadata moved from /.git to /ch/.git, and /.gitignore - # went out-of-band (to info/exclude in the repository). - for img in self.unpack_base.iterdir(): - old = img // ".git" - new = img // "ch/git" - if (old.exists()): - new.parent.mkdir() - old.rename(new) - gi = img // ".gitignore" - if (gi.exists()): - gi.unlink() - # Must also remove .gitignore from all commits. This requires - # Git operations, which we can’t do here because the build - # cache may be disabled. Do it in Enabled_Cache.configure(). 
- if (len(self.build_cache.listdir()) > 0): - self.bucache_needs_ignore_upgrade.file_ensure_exists() if (v_found == 6): # Charliecloud 0.32 had a bug where symlinks to fat manifests # that were really skinny were erroneously absolute, making the From fa4cf7869e02d9a0a72058df1b258b92045b4439 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky <1682574+reidpr@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:11:25 -0600 Subject: [PATCH 08/16] =?UTF-8?q?PR=20#1882:=20fix=20$CFLAGS=20duplication?= =?UTF-8?q?=20that=20we=E2=80=99ve=20had=20since=202019?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Makefile.am | 3 ++- configure.ac | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/Makefile.am b/bin/Makefile.am index bb60ff3dd..0b2b9a77e 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -13,7 +13,8 @@ if HAVE_LIBSQUASHFUSE ch_run_SOURCES += ch_fuse.h ch_fuse.c endif -ch_run_CFLAGS = $(CFLAGS) $(PTHREAD_CFLAGS) +# additional build flags for ch-run +ch_run_CFLAGS = $(PTHREAD_CFLAGS) ch_run_LDADD = $(CH_RUN_LIBS) diff --git a/configure.ac b/configure.ac index 3ca364e80..b3668680e 100644 --- a/configure.ac +++ b/configure.ac @@ -941,7 +941,7 @@ Building Charliecloud test suite ... ${enable_test} required: - C99 compiler ... ${CC} ${CC_VERSION} + C99 compiler ... ${CC} ${CFLAGS} optional: extended glob patterns in --unset-env ... ${have_fnm_extmatch} From 1e1ef743eda53818e68ec79ac9515359a4f0b082 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky <1682574+reidpr@users.noreply.github.com> Date: Thu, 28 Mar 2024 15:00:45 -0600 Subject: [PATCH 09/16] PR #1885: fix ObsPy test --- examples/obspy/Dockerfile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/obspy/Dockerfile b/examples/obspy/Dockerfile index f3d14f769..3e1a29bda 100644 --- a/examples/obspy/Dockerfile +++ b/examples/obspy/Dockerfile @@ -17,8 +17,13 @@ WORKDIR /usr/local/src # # 2. 
Use latest version so we catch sooner if things explode. # +# 3. ObsPy 1.4.0, the latest as of 2024-03-27, is incompatible with Python +# 3.12 [2], which is recently the default in Miniconda (see PR #1885 and +# issue #1886). +# # [1]: https://docs.anaconda.com/anaconda/user-guide/faq/ -ARG MC_VERSION=latest +# [2]: https://github.com/obspy/obspy/issues/3313#issuecomment-1818165937 +ARG MC_VERSION=py311_24.1.2-0 ARG MC_FILE=Miniconda3-$MC_VERSION-Linux-x86_64.sh RUN wget -nv https://repo.anaconda.com/miniconda/$MC_FILE # Miniconda will fail if the HOME variable is not set. @@ -32,7 +37,6 @@ RUN conda config --set auto_update_conda False # new environment for obspy. # See: https://github.com/obspy/obspy/wiki/Installation-via-Anaconda RUN conda config --add channels conda-forge -# Use numpy 1.21 to avoid isse: https://github.com/obspy/obspy/issues/2940 RUN conda install --yes obspy=1.4.0 RUN conda update obspy From 88e9a71f1fe586332f79f342f3dab850ac025f60 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky <1682574+reidpr@users.noreply.github.com> Date: Thu, 28 Mar 2024 15:01:45 -0600 Subject: [PATCH 10/16] PR #1887: configure: add missed fnctl.h --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index b3668680e..87309bb54 100644 --- a/configure.ac +++ b/configure.ac @@ -343,6 +343,7 @@ AC_MSG_RESULT($have_userns) AC_DEFUN([CH_OVERLAY_C], [[ #define _GNU_SOURCE #include + #include #include #include #include From e94d3752a28334ee0468e5ff27d93088b67308ee Mon Sep 17 00:00:00 2001 From: Oliver Freyermuth Date: Sun, 7 Apr 2024 23:01:00 +0200 Subject: [PATCH 11/16] PR #1859: switch SquashFUSE to libfuse3 types The forget operation in libfuse3 takes uint64_t as third parameter, while SquashFUSE defaults to unsigned long as used in libfuse2. This causes a mess on arches with different size of these types, so explicitly switch to the libfuse3 variant. 
--- bin/ch_fuse.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/ch_fuse.c b/bin/ch_fuse.c index 0ebda0a59..ce60bbcc7 100644 --- a/bin/ch_fuse.c +++ b/bin/ch_fuse.c @@ -27,6 +27,11 @@ // SquashFUSE redefines __le16 unless HAVE_LINUX_TYPES_LE16 is defined. We are // assuming it is defined in on your machine. #define HAVE_LINUX_TYPES_LE16 +// The forget operation in libfuse3 takes uint64_t as third parameter, +// while SquashFUSE defaults to unsigned long as used in libfuse2. +// This causes a mess on arches with different size of these types, +// so explicitly switch to the libfuse3 variant. +#define HAVE_FUSE_LL_FORGET_OP_64T // Now we can include ll.h. #include From 43844da07703b2950bb35e2b6f7c2c7a61bd5ba7 Mon Sep 17 00:00:00 2001 From: Jemma Stachelek Date: Fri, 12 Apr 2024 11:01:30 -0600 Subject: [PATCH 12/16] PR #1890: fix broken spack link --- doc/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/install.rst b/doc/install.rst index d3050b66f..934144994 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -224,7 +224,7 @@ package managers. Maintained by us: * `Spack - `_; + `_; install with :code:`+builder` to get :code:`ch-image`. * `Fedora/EPEL `_; check for available versions with :code:`{yum,dnf} list charliecloud`. 
From 759da04a1ad9bd04691dd7e5657a118151b42218 Mon Sep 17 00:00:00 2001 From: Jordan Ogas Date: Mon, 15 Apr 2024 11:37:58 -0600 Subject: [PATCH 13/16] PR #1883: Spack example: use external `packages.yaml`, add patch --- examples/spack/Dockerfile | 55 ++++++++++++++++++++++++------------ examples/spack/libfuse.patch | 9 ++++++ examples/spack/packages.yaml | 47 ++++++++++++++++++++++++++++++ misc/loc | 2 ++ 4 files changed, 95 insertions(+), 18 deletions(-) create mode 100644 examples/spack/libfuse.patch create mode 100644 examples/spack/packages.yaml diff --git a/examples/spack/Dockerfile b/examples/spack/Dockerfile index c99134cec..473661823 100644 --- a/examples/spack/Dockerfile +++ b/examples/spack/Dockerfile @@ -13,20 +13,29 @@ FROM almalinux:8 # Spack’s m4, and that package is in PowerTools, which we enable using sed(1) # to avoid installing the config-manager DNF plugin. # +# autoconf, git, openssl, pkg-config, python3, fuse3-libs, fuse3-devel, are +# packages that are typically installed on systems. Thus we install them outside +# of Spack and rely on them as externals to speed up the build process. +# # [1]: https://spack.readthedocs.io/en/latest/getting_started.html # [2]: https://spack.readthedocs.io/en/latest/workflows.html#using-spack-to-create-docker-images RUN sed -Ei 's/enabled=0/enabled=1/' \ /etc/yum.repos.d/almalinux-powertools.repo RUN dnf install -y --setopt=install_weak_deps=false \ + autoconf \ + automake \ bzip2 \ gcc \ gcc-c++ \ git \ gnupg2-smime \ file \ + fuse3-devel \ + fuse3-libs \ make \ patch \ - python3 \ + pkg-config \ + python38 \ texinfo \ unzip \ which \ @@ -50,28 +59,38 @@ ARG SPACK_REPO=https://github.com/spack/spack RUN git clone $SPACK_REPO && cd spack && git checkout releases/latest # slow RUN cd spack && git status && git rev-parse --short HEAD -# Set up environment to use Spack. (We can’t use setup-env.sh because the -# Dockerfile shell is sh, not Bash.) 
-ENV PATH /spack/bin:$PATH -RUN spack compiler find --scope system +# Copy our Spack package file; by relying on external packages already installed +# by the container we expedite the spack install process. We do this using +# Spack’s config hierarchy, e.g., /etc/spack; however, this file could also be +# placed in the user $HOME/.spack directory. +COPY packages.yaml /etc/spack/ -# Test: Some basic commands. -RUN which spack -RUN spack --version -RUN spack compiler find -RUN spack compiler list -RUN spack compiler list --scope=system -RUN spack compiler list --scope=user -RUN spack compilers -RUN spack spec charliecloud +# Apply a patch that resolves issues with Charliecloud 0.35 finding the +# Squashfuse ll.h header. Remove after https://github.com/spack/spack/pull/43374 +# is merged and included in the latest spack release. +COPY libfuse.patch / +RUN patch -p 0 < libfuse.patch -# Test: Install Charliecloud. +# Test some basic commands and install Charliecloud. # Kludge: here we specify an older python sphinx rtd_theme version because # newer default version, 0.5.0, introduces a dependency on node-js which doesn’t # appear to build on gcc 4.8 or gcc 8.3 # (see: https://github.com/spack/spack/issues/19310). -RUN spack spec charliecloud+docs^py-sphinx-rtd-theme@0.4.3 -RUN spack install charliecloud+docs^py-sphinx-rtd-theme@0.4.3 +RUN source /spack/share/spack/setup-env.sh \ + && spack --version \ + && spack env create ch \ + && spack env activate ch \ + && spack compiler find \ + && spack compiler list --scope=system \ + && spack compiler list --scope=user \ + && spack compilers \ + && spack add charliecloud +docs +squashfuse ^py-sphinx-rtd-theme@0.4.3 \ + && spack concretize --fresh --force \ + && spack env depfile -o Makefile \ + && make -j $(nproc) SPACK_COLOR=always \ + && spack load charliecloud \ + && ch-run --version \ + && ldd $(which ch-run) # Clean up. 
-RUN spack clean --all +RUN /spack/bin/spack clean --all diff --git a/examples/spack/libfuse.patch b/examples/spack/libfuse.patch new file mode 100644 index 000000000..98f835982 --- /dev/null +++ b/examples/spack/libfuse.patch @@ -0,0 +1,9 @@ +index 0e8f983545..b85ef9958a 100644 +--- spack/var/spack/repos/builtin/packages/charliecloud/package.py ++++ spack/var/spack/repos/builtin/packages/charliecloud/package.py +@@ -152,5 +152,7 @@ def configure_args(self): + if "+squashfuse" in self.spec: + squashfuse_prefix = "{0}".format(self.spec["squashfuse"].prefix) + args.append("--with-libsquashfuse={0}".format(squashfuse_prefix)) ++ fuse_include = self.spec["fuse"].prefix.include.fuse3 ++ args.append("CFLAGS=-I{0}".format(fuse_include)) diff --git a/examples/spack/packages.yaml b/examples/spack/packages.yaml new file mode 100644 index 000000000..7472c8ec7 --- /dev/null +++ b/examples/spack/packages.yaml @@ -0,0 +1,47 @@ +packages: + # The following packages are built externally to speed up the spack build + # process; they can be built from spack without issue, e.g., you can remove + # them from here without issue. + autoconf: + buildable: false + externals: + - spec: autoconf@2.69 + prefix: /usr + automake: + buildable: false + externals: + - spec: automake@1.16.1 + prefix: /usr + git: + buildable: false + externals: + - spec: git@2.39.3 + prefix: /usr + perl: + buildable: false + externals: + - spec: perl@5.26.3 + prefix: /usr + pkgconf: + buildable: false + externals: + - spec: pkgconf@1.4.2 + prefix: /usr + python: + buildable: false + externals: + - spec: python@3.8.17 + prefix: /usr + openssl: + buildable: false + externals: + - spec: openssl@1.1.1 + prefix: /usr + +# Unlike the above, the following packages require a sysadmin. Removing these +# will likely cause issues. 
+ libfuse: + buildable: false + externals: + - spec: libfuse@3.3.0 + prefix: /usr diff --git a/misc/loc b/misc/loc index 77fb83f2f..6bf9026ae 100755 --- a/misc/loc +++ b/misc/loc @@ -243,6 +243,8 @@ find ./.github ./examples ./test -type f -a \( \ -o -path ./test/fixtures/README \ -o -path ./.github/PERUSEME \ -o -path ./examples/chtest/printns \ + -o -path ./examples/spack/packages.yaml \ + -o -path ./examples/spack/libfuse.patch \ -o -path ./test/approved-trailing-whitespace \ -o -path ./test/common.bash \ -o -path ./test/doctest-auto \ From ca75c0124fbcc20d0ff7a47b7a9c6a9dd70166d3 Mon Sep 17 00:00:00 2001 From: Jordan Ogas Date: Mon, 15 Apr 2024 11:42:37 -0600 Subject: [PATCH 14/16] PR #1884: misc/loc: add unknown-files hint --- misc/loc | 1 + 1 file changed, 1 insertion(+) diff --git a/misc/loc b/misc/loc index 6bf9026ae..eed3ce947 100755 --- a/misc/loc +++ b/misc/loc @@ -327,6 +327,7 @@ if [[ -s /tmp/loc.extra ]]; then cat /tmp/loc.extra echo echo '🚨🚨🚨 unknown files found 🚨🚨🚨' + echo 'hint: did you forget to add new file(s) to misc/loc?' exit 1 fi From 7386c358f2b94d9c1fe61339935f6594f66810bb Mon Sep 17 00:00:00 2001 From: Jordan Ogas Date: Wed, 17 Apr 2024 14:46:23 -0600 Subject: [PATCH 15/16] PR #1546: FAQ: add MPI best practices --- doc/best_practices.rst | 152 ++++++++++++++++++++++++++++++++++++++- doc/faq.rst | 2 +- test/build/60_force.bats | 2 +- 3 files changed, 152 insertions(+), 4 deletions(-) diff --git a/doc/best_practices.rst b/doc/best_practices.rst index 429cbb2bf..57cb8fb61 100644 --- a/doc/best_practices.rst +++ b/doc/best_practices.rst @@ -1,6 +1,10 @@ Best practices ************** +.. contents:: + :depth: 3 + :local: + Other best practices information ================================ @@ -303,5 +307,149 @@ building, and then run using a separate container invoked from a different terminal. -.. LocalWords: userguide Gruening Souppaya Morello Scarfone openmpi nist -.. 
LocalWords: ident OCFS MAGICK +MPI +=== + +Problems that best practices help you avoid +------------------------------------------- + +These recommendations are derived from our experience in mitigating container +MPI issues. It is important to note that, despite marketing claims, no single +container implementation has “solved” MPI or is free of warts; the issues are +numerous, multifaceted, and dynamic. + +Key concepts and related issues include: + + 1. **Workload management**. Running applications on HPC clusters requires + resource management and job scheduling. Put simply, resource management + is the act of allocating and restricting compute resources, e.g., CPU and + memory, whereas job scheduling is the act of prioritizing and enforcing + resource management. *Both require privileged operations.* + + Some privileged container implementations attempt to provide their own + workload management, often referred to as “container orchestration”. + + Charliecloud is lightweight and completely unprivileged. We rely on + existing, reputable and well established HPC workload managers such as + Slurm. + + 2. **Job launch**. When a multi-node MPI job is launched, each node must + launch a number of containerized processes, i.e., *ranks*. Doing this + unprivileged and at scale requires interaction between the application + and workload manager. That is, something like Process Management + Interface (PMI) is needed to facilitate the job launch. + + 3. **Shared memory**. Processes in separate sibling containers cannot use + single-copy *cross-memory attach* (CMA), as opposed to double-copy POSIX + or SysV shared memory. The solution is to put all ranks in the *same* + container with :code:`ch-run --join`. (See above for details: + :ref:`faq_join`.) + + 4. **Network fabric.** Performant MPI jobs must recognize and use a system’s + high-speed interconnect. Common issues that arise are: + + a. 
Libraries required to use the interconnect are proprietary or + otherwise unavailable to the container. + + b. The interconnect is not supported by the container MPI. + + In both cases, the containerized MPI application will either fail or run + significantly slower. + +These problems can be avoided, and this section describes our recommendations +to do so. + +Recommendations TL;DR +--------------------- + +Generally, we recommend building a flexible MPI container using: + + a. **libfabric** to flexibly manage process communication over a diverse + set of network fabrics; + + b. a parallel **process management interface** (PMI), compatible with the + host workload manager (e.g., PMI2, PMIx, flux-pmi); and + + c. an **MPI** that supports (1) libfabric and (2) the selected PMI. + +More experienced MPI and unprivileged container users can find success through +MPI replacement (injection); however, such practices are beyond the scope of +this FAQ. + +The remaining sections detail the reasoning behind our approach. We recommend +referencing, or directly using, our examples +:code:`examples/Dockerfile.{libfabric,mpich,openmpi}`. + +Use libfabric +------------- + +`libfabric `_ (a.k.a. Open Fabrics +Interfaces or OFI) is a low-level communication library that abstracts diverse +networking technologies. It defines *providers* that implement the mapping +between application-facing software (e.g., MPI) and network specific drivers, +protocols, and hardware. These providers have been co-designed with fabric +hardware and application developers with a focus on HPC needs. libfabric lets +us more easily manage MPI communication over diverse network high-speed +interconnects (a.k.a. *fabrics*). + +From our libfabric example (:code:`examples/Dockerfile.libfabric`): + +.. literalinclude:: ../examples/Dockerfile.libfabric + :language: docker + :lines: 116-135 + +The above compiles libfabric with several “built-in” providers, i.e. 
+:code:`psm3` (on x86-64), :code:`rxm`, :code:`shm`, :code:`tcp`, and +:code:`verbs`, which enables MPI applications to run efficiently over most +verb devices using TCP, IB, OPA, and RoCE protocols. + +Two key advantages of using libfabric are: (1) the container’s libfabric can +make use of “external” i.e. dynamic-shared-object (DSO) providers, and +(2) libfabric replacement is simpler than MPI replacement and preserves the +original container MPI. That is, managing host/container ABI compatibility is +difficult and error-prone, so we instead manage the more forgiving libfabric +ABI compatibility. + +A DSO provider can be used by a libfabric that did not originally compile it, +i.e., it can be compiled on a target host and later injected into the +container along with any missing shared library dependencies, and used by the +container’s libfabric. To build a libfabric provider as a DSO, add :code:`=dl` +to its :code:`configure` argument, e.g., :code:`--with-cxi=dl`. + +A container’s libfabric can also be replaced by a host libfabric. This is a +brittle but usually effective way to give containers access to the Cray +libfabric Slingshot provider :code:`cxi`. + +In Charliecloud, both of these injection operations are currently done with +:code:`ch-fromhost`, though see `issue #1861 +`_. + +Choose a compatible PMI +----------------------- + +Unprivileged processes, including unprivileged containerized processes, are +unable to independently launch containerized processes on different nodes, +aside from using SSH, which isn’t scalable. We must either (1) rely on a host +supported parallel process management interface (PMI), or (2) achieve +host/container MPI ABI compatibility through unsavory practices such as +complete container MPI replacement. + +The preferred PMI implementation, e.g., PMI1, PMI2, OpenPMIx, or flux-pmi, +will be that which is best supported by your host workload manager and +container MPI. 
+ +In :code:`examples/Dockerfile.libfabric`, we selected :code:`OpenPMIx` because +(1) it is supported by SLURM, OpenMPI, and MPICH, (2) it is required for +exascale, and (3) OpenMPI versions 5 and newer will no longer support PMI2. + +Choose an MPI compatible with your libfabric and PMI +---------------------------------------------------- + +There are various MPI implementations, e.g., OpenMPI, MPICH, MVAPICH2, +Intel-MPI, etc., to consider. We generally recommend OpenMPI; however, your +MPI implementation of choice will ultimately be that which best supports the +libfabric and PMI most compatible with your hardware and workload manager. + + +.. LocalWords: userguide Gruening Souppaya Morello Scarfone openmpi nist dl +.. LocalWords: ident OCFS MAGICK mpich psm rxm shm DSO pmi MVAPICH diff --git a/doc/faq.rst b/doc/faq.rst index fb6cb16c6..83ba73e8e 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -1364,4 +1364,4 @@ conversion. Important caveats include: .. LocalWords: CAs SY Gutmann AUTH rHsFFqwwqh MrieaQ Za loc mpihello mvo du .. LocalWords: VirtualSize linuxcontainers jour uk lxd rwxr xr qq qqq drwxr -.. LocalWords: drwx +.. LocalWords: drwx mpich diff --git a/test/build/60_force.bats b/test/build/60_force.bats index 2205efb89..81595d0c8 100644 --- a/test/build/60_force.bats +++ b/test/build/60_force.bats @@ -67,7 +67,7 @@ EOF ch-image -v build --force -t tmpimg -f - . 
<<'EOF' FROM almalinux:8 -RUN curl -sO https://repo.almalinux.org/vault/8.6/BaseOS/x86_64/os/Packages/openssh-8.0p1-13.el8.x86_64.rpm +RUN curl -sSOL https://vault.almalinux.org/8.6/BaseOS/x86_64/os/Packages/openssh-8.0p1-13.el8.x86_64.rpm RUN rpm --install *.rpm EOF } From 649c71096ae3f64fcbee9564c55b04ad26003010 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky <1682574+reidpr@users.noreply.github.com> Date: Mon, 22 Apr 2024 15:35:36 -0600 Subject: [PATCH 16/16] PR #1892: several fixes related to -W testing --- .github/workflows/main.yml | 9 ++++++++ bin/ch-run.c | 12 ++++++++++ bin/ch_core.c | 45 ++++++++++++++++++++++++++++---------- configure.ac | 4 ++++ doc/ch-run.rst | 13 ++++++++--- test/run/ch-run_misc.bats | 7 +++--- 6 files changed, 72 insertions(+), 18 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 596834bb7..5cc5e943c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -47,6 +47,15 @@ jobs: steps: - uses: actions/checkout@v3 + # This allows SSH access to the GitHub Actions VM to debug things that + # only happen on CI. Comment out unless needed. WARNING: tmate.io has + # access to unencrypted SSH traffic. 
+ # See: https://github.com/marketplace/actions/debugging-with-tmate + #- name: set up tmate session + # uses: mxschmitt/action-tmate@v3 + # with: + # detached: true + - name: early setup & validation run: | [[ -n $CH_TEST_BUILDER ]] diff --git a/bin/ch-run.c b/bin/ch-run.c index a2a5e8afe..774f02ed9 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -446,6 +446,12 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) exit(0); #else exit(1); +#endif + } else if (!strcmp(arg, "overlayfs")) { +#ifdef HAVE_OVERLAYFS + exit(0); +#else + exit(1); #endif } else if (!strcmp(arg, "seccomp")) { #ifdef HAVE_SECCOMP @@ -458,6 +464,12 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) exit(0); #else exit(1); +#endif + } else if (!strcmp(arg, "tmpfs-xattrs")) { +#ifdef HAVE_TMPFS_XATTRS + exit(0); +#else + exit(1); #endif } else diff --git a/bin/ch_core.c b/bin/ch_core.c index cd25e2bd0..3850dbfa2 100644 --- a/bin/ch_core.c +++ b/bin/ch_core.c @@ -41,10 +41,27 @@ /* Timeout in seconds for waiting for join semaphore. */ #define JOIN_TIMEOUT 30 -/* Maximum length of paths we're willing to deal with. (Note that +/* Maximum length of paths we’re willing to deal with. (Note that system-defined PATH_MAX isn't reliable.) */ #define PATH_CHARS 4096 +/* Mount point for the tmpfs used by -W. We want this to be (a) always + available [1], (b) short, (c) not used by anything else we care about + during container setup, and (d) not wildly confusing if users see it in an + error message. Must be a string literal because we use C’s literal + concatenation feature. Options considered (all of these required by FHS): + + /boot Not present if host is booted in some strange way? + /etc Likely very reliable but seems risky + /mnt Used for images on GitHub Actions and causes CI failures + /opt Seems very omittable + /srv I’ve never actually seen it used; reliable? + /var Too aggressive? + /var/spool Long; omittable for lightweight hosts? 
+ + [1]: https://www.pathname.com/fhs/pub/fhs-2.3.pdf */ +#define WF_MNT "/srv" + /** Constants **/ @@ -306,26 +323,30 @@ void enter_udss(struct container *c) // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html // https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html if (c->overlay_size != NULL) { - VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size); char *options; + struct stat st; + VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size); T_ (1 <= asprintf(&options, "size=%s", c->overlay_size)); - Zf (mount(NULL, "/mnt", "tmpfs", 0, options), // host should have /mnt + Zf (mount(NULL, WF_MNT, "tmpfs", 0, options), "cannot mount tmpfs for overlay"); free(options); - Z_ (mkdir("/mnt/upper", 0700)); - Z_ (mkdir("/mnt/work", 0700)); - Z_ (mkdir("/mnt/merged", 0700)); - mkdir_scratch = "/mnt/mkdir_overmount"; + Z_ (mkdir(WF_MNT "/upper", 0700)); + Z_ (mkdir(WF_MNT "/work", 0700)); + Z_ (mkdir(WF_MNT "/merged", 0700)); + mkdir_scratch = WF_MNT "/mkdir_overmount"; Z_ (mkdir(mkdir_scratch, 0700)); - T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s," - "index=on,userxattr,volatile", - c->newroot, "/mnt/upper", "/mnt/work")); + T_ (1 <= asprintf(&options, ("lowerdir=%s,upperdir=%s,workdir=%s," + "index=on,userxattr,volatile"), + c->newroot, WF_MNT "/upper", WF_MNT "/work")); // update newroot - c->newroot = "/mnt/merged"; + Zf (stat(c->newroot, &st), + "can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot); + c->newroot = WF_MNT "/merged"; free(nr_parent); free(nr_base); path_split(c->newroot, &nr_parent, &nr_base); - Zf (mount(NULL, c->newroot, "overlay", 0, options), "can't overlay"); + Zf (mount(NULL, c->newroot, "overlay", 0, options), + "can't overlay: %s, %s", c->newroot, options); VERBOSE("newroot updated: %s", c->newroot); free(options); } diff --git a/configure.ac b/configure.ac index 87309bb54..f05b4cee0 100644 --- a/configure.ac +++ b/configure.ac @@ -787,6 +787,10 @@ 
AC_SUBST([CH_RUN_LIBS]) AC_SUBST([PYTHON_SHEBANG]) AC_SUBST([SPHINX]) +AS_IF([test $have_overlayfs = yes], + [AC_DEFINE([HAVE_OVERLAYFS], [1], [unprivileged overlayfs])]) +AS_IF([test $have_tmpfs_xattrs = yes], + [AC_DEFINE([HAVE_TMPFS_XATTRS], [1], [tmpfs user xattrs])]) AS_IF([test $have_fnm_extmatch = yes], [AC_DEFINE([HAVE_FNM_EXTMATCH], [1], [extended globs supported])]) AS_IF([test $have_seccomp = yes], diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 7f2d9eebf..2771078e4 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -60,9 +60,16 @@ mounting SquashFS images with FUSE. Don’t expand variables when using :code:`--set-env`. :code:`--feature=FEAT` - If feature :code:`FEAT` is enabled, exit with success. Valid values of - :code:`FEAT` are :code:`extglob` for extended globs, :code:`seccomp` for - :code:`seccomp(2)`, and :code:`squash` for squashfs archives. + If feature :code:`FEAT` is enabled, exit successfully (zero); otherwise, + exit unsuccessfully (non-zero). Note this just communicates the results of + :code:`configure` rather than testing the feature. Valid values of + :code:`FEAT` are: + + * :code:`extglob`: extended globs in :code:`--unset-env` + * :code:`seccomp`: :code:`--seccomp` available + * :code:`squash`: internal SquashFUSE image mounts + * :code:`overlayfs`: unprivileged overlayfs support + * :code:`tmpfs-xattrs`: :code:`user` xattrs on tmpfs :code:`-g`, :code:`--gid=GID` Run as group :code:`GID` within container. 
diff --git a/test/run/ch-run_misc.bats b/test/run/ch-run_misc.bats index 7c65fec2d..b5f2ad991 100644 --- a/test/run/ch-run_misc.bats +++ b/test/run/ch-run_misc.bats @@ -12,7 +12,7 @@ setup () { demand-overlayfs () { - ch-run -W "$ch_timg" -- true || skip 'no unpriv overlayfs' + ch-run --feature=overlayfs || skip 'no unpriv overlayfs' } @@ -291,10 +291,11 @@ EOF [[ $status -eq 0 ]] # --home - run ch-run --home "$img" -- ls -lah /home + run ch-run --home "$img" -- ls -lAh /home echo "$output" [[ $status -eq 0 ]] - [[ $(echo "$output" | wc -l) -eq 3 ]] + [[ $(echo "$output" | wc -l) -eq 5 ]] # 4 files plus “total” line + [[ $output = *.orig* ]] [[ $output = *directory-in-home* ]] [[ $output = *file-in-home* ]] [[ $output = *"$USER"* ]]