diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd147a24..bc0ae922 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,6 +13,12 @@ on: - ".github/workflows/composite-cmake/**" pull_request: +env: + # Due to busy file system problems on the CI runners, we introduced a patch to + # sleep if the MPI count of an I/O operation is unexpectedly 0 and then retry + # the I/O operation once. The time is in milliseconds. + SC_IO_SLEEP_TIME: 500 + jobs: linux-multi: @@ -29,6 +35,9 @@ jobs: - name: Checkout source code uses: actions/checkout@main + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + - name: Run bootstrap script run: ./bootstrap @@ -37,7 +46,8 @@ jobs: run: | DIR="checkMPIdebug_shared" && mkdir -p "$DIR" && cd "$DIR" ../configure --enable-mpi --enable-debug --disable-shared \ - CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" + CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 @@ -45,7 +55,8 @@ jobs: shell: bash run: | DIR="checkMPI" && mkdir -p "$DIR" && cd "$DIR" - ../configure --enable-mpi CFLAGS="-O2" + ../configure --enable-mpi CFLAGS="-O2" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 @@ -53,7 +64,8 @@ jobs: shell: bash run: | DIR="checkdebug" && mkdir -p "$DIR" && cd "$DIR" - ../configure --enable-debug CFLAGS="-O0 -g -Wall -Wno-uninitialized" + ../configure --enable-debug CFLAGS="-O0 -g -Wall -Wno-uninitialized" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 @@ -66,7 +78,8 @@ jobs: shell: bash run: | DIR="checkMPIdebugCXX" && mkdir -p "$DIR" && cd "$DIR" - ../configure --enable-mpi --enable-debug CFLAGS="-O0" CC=mpicxx + ../configure --enable-mpi --enable-debug CFLAGS="-O0" CC=mpicxx \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 @@ -75,7 +88,8 @@ jobs: 
run: | DIR="checkOpenMPMPIdebug" && mkdir -p "$DIR" && cd "$DIR" ../configure --enable-openmp="-fopenmp" --enable-mpi \ - --enable-debug CFLAGS="-O0" + --enable-debug CFLAGS="-O0" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 @@ -83,7 +97,7 @@ jobs: shell: bash run: | DIR="distcheck" && mkdir -p "$DIR" && cd "$DIR" - ../configure + ../configure CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j distcheck V=0 - name: Upload log files @@ -110,6 +124,9 @@ jobs: - name: Checkout source code uses: actions/checkout@main + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + - name: Run bootstrap script run: ./bootstrap @@ -119,7 +136,8 @@ jobs: DIR="checkMPIdebug_valgrind" && mkdir -p "$DIR" && cd "$DIR" ../configure --enable-mpi --enable-debug \ --disable-shared --enable-valgrind \ - CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" + CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 @@ -149,6 +167,9 @@ jobs: with: fetch-depth: 0 + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + - name: Identify version shell: bash run: | @@ -165,7 +186,8 @@ jobs: ../configure --enable-mpi --enable-debug \ CFLAGS="-O0 -g -pedantic -Wall -Wextra -Werror \ -Wno-unused-parameter -Wno-builtin-declaration-mismatch \ - -Wno-implicit-fallthrough" + -Wno-implicit-fallthrough" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 make -j distcheck V=0 diff --git a/.github/workflows/ci_cmake.yml b/.github/workflows/ci_cmake.yml index 733e46c2..ee1e9bf5 100644 --- a/.github/workflows/ci_cmake.yml +++ b/.github/workflows/ci_cmake.yml @@ -20,6 +20,10 @@ env: CTEST_PARALLEL_LEVEL: 0 CMAKE_INSTALL_PREFIX: ~/local CMAKE_PREFIX_PATH: ~/local + # Due to busy file system problems on the CI runners, we introduced a patch to + # 
sleep if the MPI count of an I/O operation is unexpectedly 0 and then retry + # the I/O operation once. The time is in milliseconds. + SC_IO_SLEEP_TIME: 500 jobs: @@ -47,6 +51,9 @@ jobs: - uses: actions/checkout@v4 name: Checkout source code + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + - name: Install system dependencies if: ${{ matrix.mpi }} run: | @@ -76,6 +83,9 @@ jobs: - uses: actions/checkout@v4 name: Checkout source code + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + - name: Install system dependencies if: ${{ matrix.mpi }} run: | @@ -112,6 +122,14 @@ jobs: - name: Checkout source code uses: actions/checkout@v4 + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + + - name: Set macro for I/O waiting + shell: bash + # use an environment variable to prepend the flag to the built-in compiler flags + run: echo "CFLAGS=-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" >> "$GITHUB_ENV" + - name: CMake configure run: cmake --preset default -DSC_ENABLE_MPI:BOOL=${{ matrix.mpi }} -DSC_TEST_WITH_VALGRIND:BOOL=${{ matrix.valgrind }} @@ -154,6 +172,9 @@ jobs: - uses: actions/checkout@v4 name: Checkout source code + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + - name: Install system dependencies run: brew install open-mpi @@ -177,6 +198,9 @@ jobs: - uses: actions/checkout@v4 name: Checkout source code + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + - run: echo "CMAKE_INSTALL_PREFIX=$HOME/local" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - run: echo "CMAKE_PREFIX_PATH=$HOME/local" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append diff --git a/.github/workflows/ci_darwin.yml b/.github/workflows/ci_darwin.yml index 
7d586ed9..fe0944cb 100644 --- a/.github/workflows/ci_darwin.yml +++ b/.github/workflows/ci_darwin.yml @@ -13,6 +13,12 @@ on: - ".github/workflows/composite-cmake/**" pull_request: +env: + # Due to busy file system problems on the CI runners, we introduced a patch to + # sleep if the MPI count of an I/O operation is unexpectedly 0 and then retry + # the I/O operation once. The time is in milliseconds. + SC_IO_SLEEP_TIME: 500 + jobs: darwin: runs-on: macos-latest @@ -26,6 +32,9 @@ jobs: - uses: actions/checkout@main name: Checkout source code + - name: Patch I/O for CI by sleeping if I/O intensity is too high + run: git apply doc/patch/patch-CI-IO.patch + - name: Install system dependencies run: brew install open-mpi libtool automake @@ -36,7 +45,8 @@ jobs: run: | DIR="checkdebug" && mkdir -p "$DIR" && cd "$DIR" ../configure --enable-debug \ - CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" + CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 @@ -44,7 +54,8 @@ jobs: run: | DIR="checkMPIdebug" && mkdir -p "$DIR" && cd "$DIR" ../configure --enable-mpi --enable-debug \ - CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" + CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 @@ -52,7 +63,8 @@ jobs: run: | DIR="checkMPIdebugCXX" && mkdir -p "$DIR" && cd "$DIR" ../configure --enable-mpi --enable-debug CC=mpicxx \ - CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" + CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \ + CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" make -j V=0 make -j check V=0 diff --git a/.github/workflows/composite-cmake/action.yml b/.github/workflows/composite-cmake/action.yml index 4652e7a6..0e471cf5 100644 --- a/.github/workflows/composite-cmake/action.yml +++ b/.github/workflows/composite-cmake/action.yml @@ -2,6 +2,10 @@ runs: using: "composite" steps: + - name: Set macro 
for I/O waiting + shell: bash + # use an environment variable to prepend the flag to the built-in compiler flags + run: echo "CFLAGS=-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" >> "$GITHUB_ENV" - name: CMake configure shell: bash run: >- diff --git a/doc/patch/patch-CI-IO.patch b/doc/patch/patch-CI-IO.patch new file mode 100644 index 00000000..fd502957 --- /dev/null +++ b/doc/patch/patch-CI-IO.patch @@ -0,0 +1,118 @@ +From 3860abed348e7157ce8fd6518729265c5c21ee34 Mon Sep 17 00:00:00 2001 +From: Tim Griesbach +Date: Wed, 15 Jan 2025 15:23:29 +0100 +Subject: [PATCH] Implement sleep and retry in case of busy file system + +--- + src/sc_io.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 63 insertions(+), 4 deletions(-) + +diff --git a/src/sc_io.c b/src/sc_io.c +index 4cfc7b81..904fc573 100644 +--- a/src/sc_io.c ++++ b/src/sc_io.c +@@ -37,6 +37,34 @@ + #include + #endif + ++/** In case of an unexpected retrieved count 0, this macro sleeps and retries. ++ * In the Github Actions CI, we experienced transient CI failures due to the ++ * file system not performing requested I/O operations. ++ * ++ * The macro can be only called in the case that expected count is unequal 0. ++ * It checks if count is 0 and if this is true, it sleeps and then retries the ++ * I/O operation once. ++ */ ++#define SC_IO_SLEEP_AND_RETRY(func, time) do {\ ++ retval = sc_io_error_class (sc_MPI_SUCCESS,\ ++ &errcode);\ ++ SC_CHECK_MPI (retval);\ ++ if (*ocount == 0) {\ ++ sc_sleep (time);\ ++ mpiret = func (mpifile, offset,\ ++ (void *) ptr, count, t,\ ++ &mpistatus);\ ++ if (mpiret == sc_MPI_SUCCESS) {\ ++ retval = sc_MPI_Get_count (&mpistatus,\ ++ t, ocount);\ ++ SC_CHECK_MPI (retval);\ ++ }\ ++ retval = sc_io_error_class (mpiret,\ ++ &errcode);\ ++ SC_CHECK_MPI (retval);\ ++ }} while (0) ++ ++ + sc_io_sink_t * + sc_io_sink_new (int iotype, int iomode, int ioencode, ...) 
+ { +@@ -1723,7 +1751,15 @@ sc_io_read_at (sc_MPI_File mpifile, sc_MPI_Offset offset, void *ptr, + /* working around 0 count not working for some implementations */ + mpiret = sc_MPI_Get_count (&mpistatus, t, ocount); + SC_CHECK_MPI (mpiret); +- return sc_MPI_SUCCESS; ++ ++ /* This macro sleeps and tries the I/O operation again if the requested I/O ++ * operation was not performed by the file system. ++ * The macro SC_HACK_MPI_SLEEP is expected to be defined during ++ * configuration. ++ */ ++ SC_IO_SLEEP_AND_RETRY (MPI_File_read_at, SC_HACK_MPI_SLEEP); ++ ++ return errcode; + } + retval = sc_io_error_class (mpiret, &errcode); + SC_CHECK_MPI (retval); +@@ -1806,7 +1842,14 @@ sc_io_read_at_all (sc_MPI_File mpifile, sc_MPI_Offset offset, void *ptr, + mpiret = sc_MPI_Get_count (&mpistatus, t, ocount); + SC_CHECK_MPI (mpiret); + +- return sc_MPI_SUCCESS; ++ /* This macro sleeps and tries the I/O operation again if the requested I/O ++ * operation was not performed by the file system. ++ * The macro SC_HACK_MPI_SLEEP is expected to be defined during ++ * configuration. ++ */ ++ SC_IO_SLEEP_AND_RETRY (MPI_File_read_at_all, SC_HACK_MPI_SLEEP); ++ ++ return errcode; + } + + retval = sc_io_error_class (mpiret, &errcode); +@@ -2008,7 +2051,15 @@ sc_io_write_at (sc_MPI_File mpifile, sc_MPI_Offset offset, + /* working around 0 count not working for some implementations */ + mpiret = sc_MPI_Get_count (&mpistatus, t, ocount); + SC_CHECK_MPI (mpiret); +- return sc_MPI_SUCCESS; ++ ++ /* This macro sleeps and tries the I/O operation again if the requested I/O ++ * operation was not performed by the file system. ++ * The macro SC_HACK_MPI_SLEEP is expected to be defined during ++ * configuration. 
++ */ ++ SC_IO_SLEEP_AND_RETRY (MPI_File_write_at, SC_HACK_MPI_SLEEP); ++ ++ return errcode; + } + retval = sc_io_error_class (mpiret, &errcode); + SC_CHECK_MPI (retval); +@@ -2091,7 +2142,15 @@ sc_io_write_at_all (sc_MPI_File mpifile, sc_MPI_Offset offset, + /* working around 0 count not working for some implementations */ + mpiret = sc_MPI_Get_count (&mpistatus, t, ocount); + SC_CHECK_MPI (mpiret); +- return sc_MPI_SUCCESS; ++ ++ /* This macro sleeps and tries the I/O operation again if the requested I/O ++ * operation was not performed by the file system. ++ * The macro SC_HACK_MPI_SLEEP is expected to be defined during ++ * configuration. ++ */ ++ SC_IO_SLEEP_AND_RETRY (MPI_File_write_at_all, SC_HACK_MPI_SLEEP); ++ ++ return errcode; + } + + retval = sc_io_error_class (mpiret, &errcode); +-- +2.30.2 + diff --git a/src/sc.c b/src/sc.c index ddf994d1..a30f9bb8 100644 --- a/src/sc.c +++ b/src/sc.c @@ -48,6 +48,12 @@ typedef void (*sc_sig_t) (int); #include #endif +#if _POSIX_C_SOURCE >= 199309L +#include +#else +#include +#endif + typedef struct sc_package { int is_registered; @@ -1626,3 +1632,25 @@ sc_have_json (void) return 1; #endif } + +void +sc_sleep (unsigned milliseconds){ +#if _POSIX_C_SOURCE >= 199309L + struct timespec ts; + /* full seconds */ + ts.tv_sec = milliseconds / 1000; + /* nanoseconds */ + ts.tv_nsec = (milliseconds % 1000) * 1000000; + nanosleep (&ts, NULL); +#elif defined(_POSIX_C_SOURCE) + /* older POSIX */ + if (milliseconds >= 1000) { + sleep (milliseconds / 1000); + } + usleep ((milliseconds % 1000) * 1000); +#elif _MSC_VER + Sleep (milliseconds); +#else + SC_ABORT ("No suitable sleep function available."); +#endif +} diff --git a/src/sc.h b/src/sc.h index 17991f3f..e550807f 100644 --- a/src/sc.h +++ b/src/sc.h @@ -878,6 +878,12 @@ int sc_have_zlib (void); */ int sc_have_json (void); +/** Portable function to sleep a prescribed amount of milliseconds. + * + * \param [in] milliseconds The number of milliseconds to sleep. 
+ */ +void sc_sleep (unsigned milliseconds); + SC_EXTERN_C_END; #endif /* SC_H */