Skip to content

Commit

Permalink
mpiCheckPhaseHook: add parameters to bypass errors in sandbox (#350112)
Browse files Browse the repository at this point in the history
  • Loading branch information
markuskowa authored Oct 21, 2024
2 parents 968e5b2 + 3e08945 commit c4875a4
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 120 deletions.
4 changes: 2 additions & 2 deletions pkgs/applications/science/chemistry/nwchem/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ stdenv.mkDerivation rec {
doCheck = false;

doInstallCheck = true;
nativeCheckInputs = [ mpiCheckPhaseHook ];
nativeInstallCheckInputs = [ mpiCheckPhaseHook ];
installCheckPhase = ''
runHook preInstallCheck
Expand All @@ -211,7 +211,7 @@ stdenv.mkDerivation rec {
meta = with lib; {
description = "Open Source High-Performance Computational Chemistry";
mainProgram = "nwchem";
platforms = [ "x86_64-linux" ];
platforms = [ "x86_64-linux" "aarch64-linux" ];
maintainers = with maintainers; [ sheepforce markuskowa ];
homepage = "https://nwchemgit.github.io";
license = licenses.ecl20;
Expand Down
4 changes: 4 additions & 0 deletions pkgs/build-support/setup-hooks/mpi-check-hook/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@

# Builds the setup hook named "mpi-checkPhase-hook" from ./mpi-check-hook.sh.
makeSetupHook {
name = "mpi-checkPhase-hook";

# Values substituted into the hook script: every "@topology@" placeholder in
# mpi-check-hook.sh is replaced with the path of the bundled hwloc topology
# file, which the script exports as PRTE_MCA_hwloc_use_topo_file.
substitutions = {
topology = ./topology.xml;
};
} ./mpi-check-hook.sh
11 changes: 11 additions & 0 deletions pkgs/build-support/setup-hooks/mpi-check-hook/mpi-check-hook.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ setupMpiCheck() {
# Disable CPU pinning
export OMPI_MCA_hwloc_base_binding_policy=none
export PRTE_MCA_hwloc_default_binding_policy=none

# OpenMPI gets confused by the sandbox environment and spews errors like this (both to stdout and stderr):
# [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
# [1729458724.473282] [localhost:78 :0] tcp_iface.c:893 UCX ERROR scandir(/sys/class/net) failed: No such file or directory
# These messages contaminate the test output, which makes the difftest fail.
# The solution is to use a preset CPU topology file and to disable the ucx model.

# Disable sysfs cpu topology directory discovery.
export PRTE_MCA_hwloc_use_topo_file="@topology@"
# Use the network model ob1 instead of ucx.
export OMPI_MCA_pml=ob1
;;
MPICH)
# Fix to make mpich run in a sandbox
Expand Down
10 changes: 10 additions & 0 deletions pkgs/build-support/setup-hooks/mpi-check-hook/topology.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE topology SYSTEM "hwloc2.dtd">
<topology version="2.0">
<object type="Machine" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" allowed_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" allowed_nodeset="0x00000001" gp_index="1">
<object type="Core" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="2">
<object type="NUMANode" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="4"/>
<object type="PU" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="3"/>
</object>
</object>
</topology>
112 changes: 0 additions & 112 deletions pkgs/by-name/pe/petsc/filter_mpi_warnings.patch

This file was deleted.

8 changes: 2 additions & 6 deletions pkgs/by-name/pe/petsc/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
lapack,
mpiSupport ? true,
mpi, # generic mpi dependency
mpiCheckPhaseHook,
openssh, # required for openmpi tests
petsc-withp4est ? false,
hdf5-support ? false,
Expand Down Expand Up @@ -52,12 +53,6 @@ stdenv.mkDerivation rec {
--replace /usr/bin/install_name_tool ${cctools}/bin/install_name_tool
'';

# Both OpenMPI and MPICH get confused by the sandbox environment and spew errors like this (both to stdout and stderr):
# [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
# [1684747490.391106] [localhost:14258:0] tcp_iface.c:837 UCX ERROR opendir(/sys/class/net) failed: No such file or directory
# These messages contaminate the test output, which makes the quicktest suite fail. The patch adds filtering for these messages.
patches = [ ./filter_mpi_warnings.patch ];

configureFlags = [
"--with-blas=1"
"--with-lapack=1"
Expand Down Expand Up @@ -112,6 +107,7 @@ stdenv.mkDerivation rec {
# the library is installed and available.
doInstallCheck = true;
installCheckTarget = "check_install";
nativeInstallCheckInputs = [ mpiCheckPhaseHook ];

passthru = {
inherit mpiSupport;
Expand Down

0 comments on commit c4875a4

Please sign in to comment.