Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch transient CI failures due to busy file system #214

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
38 changes: 30 additions & 8 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ on:
- ".github/workflows/composite-cmake/**"
pull_request:

env:
# Due to busy file system problems on the CI runners, we introduced a patch to
# sleep if the MPI count of an I/O operation is unexpectedly 0 and then retry
# the I/O operation once. The time is in milliseconds.
SC_IO_SLEEP_TIME: 500

jobs:

linux-multi:
Expand All @@ -29,6 +35,9 @@ jobs:
- name: Checkout source code
uses: actions/checkout@main

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- name: Run bootstrap script
run: ./bootstrap

Expand All @@ -37,23 +46,26 @@ jobs:
run: |
DIR="checkMPIdebug_shared" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-mpi --enable-debug --disable-shared \
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter"
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

- name: Make check with MPI, without debug
shell: bash
run: |
DIR="checkMPI" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-mpi CFLAGS="-O2"
../configure --enable-mpi CFLAGS="-O2" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

- name: Make check without MPI, with debug
shell: bash
run: |
DIR="checkdebug" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-debug CFLAGS="-O0 -g -Wall -Wno-uninitialized"
../configure --enable-debug CFLAGS="-O0 -g -Wall -Wno-uninitialized" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

Expand All @@ -66,7 +78,8 @@ jobs:
shell: bash
run: |
DIR="checkMPIdebugCXX" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-mpi --enable-debug CFLAGS="-O0" CC=mpicxx
../configure --enable-mpi --enable-debug CFLAGS="-O0" CC=mpicxx \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

Expand All @@ -75,15 +88,16 @@ jobs:
run: |
DIR="checkOpenMPMPIdebug" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-openmp="-fopenmp" --enable-mpi \
--enable-debug CFLAGS="-O0"
--enable-debug CFLAGS="-O0" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

- name: Make distcheck
shell: bash
run: |
DIR="distcheck" && mkdir -p "$DIR" && cd "$DIR"
../configure
../configure CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j distcheck V=0

- name: Upload log files
Expand All @@ -110,6 +124,9 @@ jobs:
- name: Checkout source code
uses: actions/checkout@main

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- name: Run bootstrap script
run: ./bootstrap

Expand All @@ -119,7 +136,8 @@ jobs:
DIR="checkMPIdebug_valgrind" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-mpi --enable-debug \
--disable-shared --enable-valgrind \
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter"
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

Expand Down Expand Up @@ -149,6 +167,9 @@ jobs:
with:
fetch-depth: 0

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- name: Identify version
shell: bash
run: |
Expand All @@ -165,7 +186,8 @@ jobs:
../configure --enable-mpi --enable-debug \
CFLAGS="-O0 -g -pedantic -Wall -Wextra -Werror \
-Wno-unused-parameter -Wno-builtin-declaration-mismatch \
-Wno-implicit-fallthrough"
-Wno-implicit-fallthrough" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0
make -j distcheck V=0
Expand Down
24 changes: 24 additions & 0 deletions .github/workflows/ci_cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ env:
CTEST_PARALLEL_LEVEL: 0
CMAKE_INSTALL_PREFIX: ~/local
CMAKE_PREFIX_PATH: ~/local
# Due to busy file system problems on the CI runners, we introduced a patch to
# sleep if the MPI count of an I/O operation is unexpectedly 0 and then retry
# the I/O operation once. The time is in milliseconds.
SC_IO_SLEEP_TIME: 500

jobs:

Expand Down Expand Up @@ -47,6 +51,9 @@ jobs:
- uses: actions/checkout@v4
name: Checkout source code

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- name: Install system dependencies
if: ${{ matrix.mpi }}
run: |
Expand Down Expand Up @@ -76,6 +83,9 @@ jobs:
- uses: actions/checkout@v4
name: Checkout source code

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- name: Install system dependencies
if: ${{ matrix.mpi }}
run: |
Expand Down Expand Up @@ -112,6 +122,14 @@ jobs:
- name: Checkout source code
uses: actions/checkout@v4

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- name: Set macro for I/O waiting
shell: bash
# Use an environment variable to prepend the flag to the built-in compiler flags.
run: echo "CFLAGS=-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" >> "$GITHUB_ENV"

- name: CMake configure
run: cmake --preset default -DSC_ENABLE_MPI:BOOL=${{ matrix.mpi }} -DSC_TEST_WITH_VALGRIND:BOOL=${{ matrix.valgrind }}

Expand Down Expand Up @@ -154,6 +172,9 @@ jobs:
- uses: actions/checkout@v4
name: Checkout source code

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- name: Install system dependencies
run: brew install open-mpi

Expand All @@ -177,6 +198,9 @@ jobs:
- uses: actions/checkout@v4
name: Checkout source code

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- run: echo "CMAKE_INSTALL_PREFIX=$HOME/local" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append

- run: echo "CMAKE_PREFIX_PATH=$HOME/local" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
Expand Down
18 changes: 15 additions & 3 deletions .github/workflows/ci_darwin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ on:
- ".github/workflows/composite-cmake/**"
pull_request:

env:
# Due to busy file system problems on the CI runners, we introduced a patch to
# sleep if the MPI count of an I/O operation is unexpectedly 0 and then retry
# the I/O operation once. The time is in milliseconds.
SC_IO_SLEEP_TIME: 500

jobs:
darwin:
runs-on: macos-latest
Expand All @@ -26,6 +32,9 @@ jobs:
- uses: actions/checkout@main
name: Checkout source code

- name: Patch I/O for CI by sleeping if I/O intensity is too high
run: git apply doc/patch/patch-CI-IO.patch

- name: Install system dependencies
run: brew install open-mpi libtool automake

Expand All @@ -36,23 +45,26 @@ jobs:
run: |
DIR="checkdebug" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-debug \
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter"
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

- name: Make check with MPI and debug
run: |
DIR="checkMPIdebug" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-mpi --enable-debug \
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter"
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

- name: Make check with MPI, debug and C++ compiler
run: |
DIR="checkMPIdebugCXX" && mkdir -p "$DIR" && cd "$DIR"
../configure --enable-mpi --enable-debug CC=mpicxx \
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter"
CFLAGS="-O0 -g -Wall -Wextra -Wno-unused-parameter" \
CPPFLAGS="-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME"
make -j V=0
make -j check V=0

Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/composite-cmake/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ runs:
using: "composite"

steps:
- name: Set macro for I/O waiting
shell: bash
# Use an environment variable to prepend the flag to the built-in compiler flags.
run: echo "CFLAGS=-DSC_HACK_MPI_SLEEP=$SC_IO_SLEEP_TIME" >> "$GITHUB_ENV"
- name: CMake configure
shell: bash
run: >-
Expand Down
118 changes: 118 additions & 0 deletions doc/patch/patch-CI-IO.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
From 3860abed348e7157ce8fd6518729265c5c21ee34 Mon Sep 17 00:00:00 2001
From: Tim Griesbach <tim.griesbach@uni-bonn.de>
Date: Wed, 15 Jan 2025 15:23:29 +0100
Subject: [PATCH] Implement sleep and retry in case of busy file system

---
src/sc_io.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 63 insertions(+), 4 deletions(-)

diff --git a/src/sc_io.c b/src/sc_io.c
index 4cfc7b81..904fc573 100644
--- a/src/sc_io.c
+++ b/src/sc_io.c
@@ -37,6 +37,34 @@
#include <errno.h>
#endif

+/** In case of an unexpected retrieved count 0, this macro sleeps and retries.
+ * In the Github Actions CI, we experienced transient CI failures due to the
+ * file system not performing requested I/O operations.
+ *
+ * The macro may only be called when the expected count is not equal to 0.
+ * It checks if count is 0 and if this is true, it sleeps and then retries the
+ * I/O operation once.
+ */
+#define SC_IO_SLEEP_AND_RETRY(func, time) do {\
cburstedde marked this conversation as resolved.
Show resolved Hide resolved
+ retval = sc_io_error_class (sc_MPI_SUCCESS,\
+ &errcode);\
+ SC_CHECK_MPI (retval);\
+ if (*ocount == 0) {\
+ sc_sleep (time);\
+ mpiret = func (mpifile, offset,\
+ (void *) ptr, count, t,\
+ &mpistatus);\
+ if (mpiret == sc_MPI_SUCCESS) {\
+ retval = sc_MPI_Get_count (&mpistatus,\
+ t, ocount);\
+ SC_CHECK_MPI (retval);\
+ }\
+ retval = sc_io_error_class (mpiret,\
+ &errcode);\
+ SC_CHECK_MPI (retval);\
+ }} while (0)
+
+
sc_io_sink_t *
sc_io_sink_new (int iotype, int iomode, int ioencode, ...)
{
@@ -1723,7 +1751,15 @@ sc_io_read_at (sc_MPI_File mpifile, sc_MPI_Offset offset, void *ptr,
/* working around 0 count not working for some implementations */
mpiret = sc_MPI_Get_count (&mpistatus, t, ocount);
SC_CHECK_MPI (mpiret);
- return sc_MPI_SUCCESS;
+
+ /* This macro sleeps and tries the I/O operation again if the requested I/O
+ * operation was not performed by the file system.
+ * The macro SC_HACK_MPI_SLEEP is expected to be defined during
+ * configuration.
+ */
+ SC_IO_SLEEP_AND_RETRY (MPI_File_read_at, SC_HACK_MPI_SLEEP);
+
+ return errcode;
}
retval = sc_io_error_class (mpiret, &errcode);
SC_CHECK_MPI (retval);
@@ -1806,7 +1842,14 @@ sc_io_read_at_all (sc_MPI_File mpifile, sc_MPI_Offset offset, void *ptr,
mpiret = sc_MPI_Get_count (&mpistatus, t, ocount);
SC_CHECK_MPI (mpiret);

- return sc_MPI_SUCCESS;
+ /* This macro sleeps and tries the I/O operation again if the requested I/O
+ * operation was not performed by the file system.
+ * The macro SC_HACK_MPI_SLEEP is expected to be defined during
+ * configuration.
+ */
+ SC_IO_SLEEP_AND_RETRY (MPI_File_read_at_all, SC_HACK_MPI_SLEEP);
+
+ return errcode;
}

retval = sc_io_error_class (mpiret, &errcode);
@@ -2008,7 +2051,15 @@ sc_io_write_at (sc_MPI_File mpifile, sc_MPI_Offset offset,
/* working around 0 count not working for some implementations */
mpiret = sc_MPI_Get_count (&mpistatus, t, ocount);
SC_CHECK_MPI (mpiret);
- return sc_MPI_SUCCESS;
+
+ /* This macro sleeps and tries the I/O operation again if the requested I/O
+ * operation was not performed by the file system.
+ * The macro SC_HACK_MPI_SLEEP is expected to be defined during
+ * configuration.
+ */
+ SC_IO_SLEEP_AND_RETRY (MPI_File_write_at, SC_HACK_MPI_SLEEP);
+
+ return errcode;
}
retval = sc_io_error_class (mpiret, &errcode);
SC_CHECK_MPI (retval);
@@ -2091,7 +2142,15 @@ sc_io_write_at_all (sc_MPI_File mpifile, sc_MPI_Offset offset,
/* working around 0 count not working for some implementations */
mpiret = sc_MPI_Get_count (&mpistatus, t, ocount);
SC_CHECK_MPI (mpiret);
- return sc_MPI_SUCCESS;
+
+ /* This macro sleeps and tries the I/O operation again if the requested I/O
+ * operation was not performed by the file system.
+ * The macro SC_HACK_MPI_SLEEP is expected to be defined during
+ * configuration.
+ */
+ SC_IO_SLEEP_AND_RETRY (MPI_File_write_at_all, SC_HACK_MPI_SLEEP);
+
+ return errcode;
}

retval = sc_io_error_class (mpiret, &errcode);
--
2.30.2

Loading
Loading