diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index dc18fd584a..7eff557094 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -40,7 +40,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6 + uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 with: results_file: results.sarif results_format: sarif diff --git a/.github/workflows/tests_archdetect.yml b/.github/workflows/tests_archdetect.yml index 922c9a1bf0..bee348995d 100644 --- a/.github/workflows/tests_archdetect.yml +++ b/.github/workflows/tests_archdetect.yml @@ -13,6 +13,8 @@ jobs: - x86_64/intel/skylake_avx512/archspec-linux-6132 - x86_64/amd/zen2/Azure-CentOS7-7V12 - x86_64/amd/zen3/Azure-CentOS7-7V73X + - x86_64/amd/zen4/Azure-Alma8-9V33X + - x86_64/amd/zen4/Shinx-RHEL8-9654 - aarch64/neoverse_n1/Azure-Ubuntu20-Altra - aarch64/neoverse_n1/AWS-awslinux-graviton2 - aarch64/neoverse_v1/AWS-awslinux-graviton3 diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index 4944d9beaa..76d19d29fe 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -3,9 +3,9 @@ name: Tests for scripts on: push: paths: - - build_container.sh - create_directory_tarballs.sh - create_lmodsitepackage.py + - eessi_container.sh - EESSI-install-software.sh - install_software_layer.sh - load_easybuild_module.sh @@ -15,9 +15,9 @@ on: pull_request: paths: - - build_container.sh - create_directory_tarballs.sh - create_lmodsitepackage.py + - eessi_container.sh - EESSI-install-software.sh - install_software_layer.sh - load_easybuild_module.sh diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 91effe4aba..8a5789c2b2 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -147,6 +147,24 @@ else mkdir -p 
${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE} fi +# if we run the script for the first time, e.g., to start building for a new +# stack, we need to ensure certain files are present in +# ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE} +# - .lmod/lmodrc.lua +# - .lmod/SitePackage.lua +_eessi_software_path=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE} +_lmod_cfg_dir=${_eessi_software_path}/.lmod +_lmod_rc_file=${_lmod_cfg_dir}/lmodrc.lua +if [ ! -f ${_lmod_rc_file} ]; then + command -V python3 + python3 ${TOPDIR}/create_lmodrc.py ${_eessi_software_path} +fi +_lmod_sitepackage_file=${_lmod_cfg_dir}/SitePackage.lua +if [ ! -f ${_lmod_sitepackage_file} ]; then + command -V python3 + python3 ${TOPDIR}/create_lmodsitepackage.py ${_eessi_software_path} +fi + # Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) # $EESSI_SILENT - don't print any messages # $EESSI_BASIC_ENV - give a basic set of environment variables @@ -203,10 +221,21 @@ ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} # Hardcode this for now, see if it works # TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install # Allow skipping CUDA SDK install in e.g. CI environments +# The install_cuda... script uses EasyBuild. So, we need to check if we have EB or skip this step. +# The full 'module avail' output is kept in $TMPDIR/ml.out so the warning below points at useful data. +module_avail_out=$TMPDIR/ml.out +module avail 2>&1 | tee ${module_avail_out} | grep EasyBuild > /dev/null +if [[ $? -eq 0 ]]; then + echo_green ">> Found an EasyBuild module" +else + echo_yellow ">> No EasyBuild module found: skipping step to install CUDA (see output in ${module_avail_out})" + export skip_cuda_install=True +fi + if [ -z "${skip_cuda_install}" ] || [ !
"${skip_cuda_install}" ]; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula else - echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed" + echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" fi # Install drivers in host_injections diff --git a/build_container.sh b/build_container.sh deleted file mode 100755 index 23a9e665c9..0000000000 --- a/build_container.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -base_dir=$(dirname $(realpath $0)) - -BUILD_CONTAINER="docker://ghcr.io/eessi/build-node:debian11" - -if [ $# -lt 2 ]; then - echo "Usage: $0 " >&2 - exit 1 -fi -SHELL_OR_RUN=$1 -EESSI_TMPDIR=$2 -shift 2 - -if [ "$SHELL_OR_RUN" == "run" ] && [ $# -eq 0 ]; then - echo "ERROR: No command specified to run?!" >&2 - exit 1 -fi - -# make sure specified temporary directory exists -mkdir -p $EESSI_TMPDIR - -echo "Using $EESSI_TMPDIR as parent for temporary directories..." 
- -# create temporary directories -mkdir -p $EESSI_TMPDIR/{home,overlay-upper,overlay-work} -mkdir -p $EESSI_TMPDIR/{var-lib-cvmfs,var-run-cvmfs} -# configure Singularity -export SINGULARITY_CACHEDIR=$EESSI_TMPDIR/singularity_cache - -# take into account that $SINGULARITY_BIND may be defined already, to bind additional paths into the build container -BIND_PATHS="$EESSI_TMPDIR/var-run-cvmfs:/var/run/cvmfs,$EESSI_TMPDIR/var-lib-cvmfs:/var/lib/cvmfs,$EESSI_TMPDIR" -if [ -z $SINGULARITY_BIND ]; then - export SINGULARITY_BIND="$BIND_PATHS" -else - export SINGULARITY_BIND="$SINGULARITY_BIND,$BIND_PATHS" -fi - -# allow that SINGULARITY_HOME is defined before script is run -if [ -z $SINGULARITY_HOME ]; then - export SINGULARITY_HOME="$EESSI_TMPDIR/home:/home/$USER" -fi - -source ${base_dir}/init/eessi_defaults -# strip "/cvmfs/" from default setting -repo_name=${EESSI_CVMFS_REPO/\/cvmfs\//} - -# set environment variables for fuse mounts in Singularity container -export EESSI_PILOT_READONLY="container:cvmfs2 ${repo_name} /cvmfs_ro/${repo_name}" -export EESSI_PILOT_WRITABLE_OVERLAY="container:fuse-overlayfs -o lowerdir=/cvmfs_ro/${repo_name} -o upperdir=$EESSI_TMPDIR/overlay-upper -o workdir=$EESSI_TMPDIR/overlay-work ${EESSI_CVMFS_REPO}" - -# pass $EESSI_SOFTWARE_SUBDIR_OVERRIDE into build container (if set) -if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then - export SINGULARITYENV_EESSI_SOFTWARE_SUBDIR_OVERRIDE=${EESSI_SOFTWARE_SUBDIR_OVERRIDE} - # also specify via $APPTAINERENV_* (future proof, cfr. https://apptainer.org/docs/user/latest/singularity_compatibility.html#singularity-environment-variable-compatibility) - export APPTAINERENV_EESSI_SOFTWARE_SUBDIR_OVERRIDE=${EESSI_SOFTWARE_SUBDIR_OVERRIDE} -fi - -if [ "$SHELL_OR_RUN" == "shell" ]; then - # start shell in Singularity container, with EESSI repository mounted with writable overlay - echo "Starting Singularity build container..." 
- singularity shell --fusemount "$EESSI_PILOT_READONLY" --fusemount "$EESSI_PILOT_WRITABLE_OVERLAY" $BUILD_CONTAINER -elif [ "$SHELL_OR_RUN" == "run" ]; then - echo "Running '$@' in Singularity build container..." - singularity exec --fusemount "$EESSI_PILOT_READONLY" --fusemount "$EESSI_PILOT_WRITABLE_OVERLAY" $BUILD_CONTAINER "$@" -else - echo "ERROR: Unknown action specified: $SHELL_OR_RUN (should be either 'shell' or 'run')" >&2 - exit 1 -fi diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 0761f2cdf1..47aa20e51e 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -20,6 +20,28 @@ return content end +local function from_eessi_prefix(t) + -- eessi_prefix is the prefix with official EESSI modules + -- e.g. /cvmfs/software.eessi.io/versions/2023.06 + local eessi_prefix = os.getenv("EESSI_PREFIX") + + -- If EESSI_PREFIX wasn't defined, we cannot check if this module was from the EESSI environment + -- In that case, we assume it isn't (return the boolean false), otherwise EESSI_PREFIX would (probably) have been set + if eessi_prefix == nil then + return false + else + -- NOTE: exact paths for site so may need to be updated later. + -- See https://github.com/EESSI/software-layer/pull/371 + + -- eessi_prefix_host_injections is the prefix with site-extensions (i.e. additional modules) + -- to the official EESSI modules, e.g. /cvmfs/software.eessi.io/host_injections/2023.06 + local eessi_prefix_host_injections = string.gsub(eessi_prefix, 'versions', 'host_injections') + + -- Check if the full modulepath starts with the eessi_prefix_* (plain-text find at position 1, so '.' or '-' in the prefix are literal) + return string.find(t.fn, eessi_prefix, 1, true) == 1 or string.find(t.fn, eessi_prefix_host_injections, 1, true) == 1 + end +end + local function load_site_specific_hooks() -- This function will be run after the EESSI hooks are registered -- It will load a local SitePackage.lua that is architecture independent (if it exists) from e.g.
@@ -153,10 +175,13 @@ -- Combine both functions into a single one, as we can only register one function as load hook in lmod -- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed function eessi_load_hook(t) - eessi_cuda_enabled_load_hook(t) + -- Only apply CUDA hooks if the loaded module is in the EESSI prefix + -- This avoids getting an Lmod Error when trying to load a CUDA module from a local software stack + if from_eessi_prefix(t) then + eessi_cuda_enabled_load_hook(t) + end end - hook.register("load", eessi_load_hook) -- Note that this needs to happen at the end, so that any EESSI specific hooks can be overwritten by the site diff --git a/eb_hooks.py b/eb_hooks.py index 20b5f76cfc..8b0a11b0ed 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -335,6 +335,21 @@ def pre_configure_hook(self, *args, **kwargs): PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) +def pre_configure_hook_gromacs(self, *args, **kwargs): + """ + Pre-configure hook for GROMACS: + - avoid building with SVE instructions on Neoverse V1 as workaround for failing tests, + see https://gitlab.com/gromacs/gromacs/-/issues/5057 + https://gitlab.com/eessi/support/-/issues/47 + """ + if self.name == 'GROMACS': + cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') + if LooseVersion(self.version) <= LooseVersion('2024.1') and cpu_target == CPU_TARGET_NEOVERSE_V1: + self.cfg.update('configopts', '-DGMX_SIMD=ARM_NEON_ASIMD') + print_msg("Avoiding use of SVE instructions for GROMACS %s by using ARM_NEON_ASIMD as GMX_SIMD value", self.version) + else: + raise EasyBuildError("GROMACS-specific hook triggered for non-GROMACS easyconfig?!") + + def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs): """ Pre-configure hook for OpenBLAS: add DYNAMIC_ARCH=1 to build/test/install options when using --optarch=GENERIC @@ -597,8 +612,8 @@ def post_sanitycheck_cuda(self, *args, **kwargs): full_path = os.path.join(dir_path, filename) # we only really care 
about real files, i.e. not symlinks if not os.path.islink(full_path): - # check if the current file is part of the allowlist - basename = os.path.splitext(filename)[0] + # check if the current file name stub is part of the allowlist + basename = filename.split('.')[0] if basename in allowlist: self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) else: @@ -665,6 +680,7 @@ def inject_gpu_property(ec): } PRE_CONFIGURE_HOOKS = { + 'GROMACS': pre_configure_hook_gromacs, 'libfabric': pre_configure_hook_libfabric_disable_psm3_x86_64_generic, 'MetaBAT': pre_configure_hook_metabat_filtered_zlib_dep, 'OpenBLAS': pre_configure_hook_openblas_optarch_generic, diff --git a/eessi-2023.06-known-issues.yml b/eessi-2023.06-known-issues.yml index 011cb2dc08..2d1256354f 100644 --- a/eessi-2023.06-known-issues.yml +++ b/eessi-2023.06-known-issues.yml @@ -19,6 +19,9 @@ - FFTW.MPI-3.3.10-gompi-2023b: - issue: https://github.com/EESSI/software-layer/issues/325 - info: "Flaky FFTW tests, random failures" + - GROMACS-2024.1-foss-2023b: + - issue: https://github.com/EESSI/software-layer/issues/557 + - info: "SVE disabled due to known bug which causes test failures" - Highway-1.0.3-GCCcore-12.2.0.eb: - issue: https://github.com/EESSI/software-layer/issues/469 - info: "failing SVE test due to wrong expected value" diff --git a/init/arch_specs/eessi_arch_x86.spec b/init/arch_specs/eessi_arch_x86.spec index 8d01cb0c03..bfbc5b4be1 100755 --- a/init/arch_specs/eessi_arch_x86.spec +++ b/init/arch_specs/eessi_arch_x86.spec @@ -1,6 +1,7 @@ # x86_64 CPU architecture specifications # Software path in EESSI | Vendor ID | List of defining CPU features -"x86_64/intel/haswell" "GenuineIntel" "avx2 fma" # Intel Haswell, Broadwell +"x86_64/intel/haswell" "GenuineIntel" "avx2 fma" # Intel Haswell, Broadwell "x86_64/intel/skylake_avx512" "GenuineIntel" "avx2 fma avx512f avx512bw avx512cd avx512dq avx512vl" # Intel Skylake, Cascade Lake -"x86_64/amd/zen2" "AuthenticAMD" "avx2 fma" 
# AMD Rome -"x86_64/amd/zen3" "AuthenticAMD" "avx2 fma vaes" # AMD Milan, Milan-X +"x86_64/amd/zen2" "AuthenticAMD" "avx2 fma" # AMD Rome +"x86_64/amd/zen3" "AuthenticAMD" "avx2 fma vaes" # AMD Milan, Milan-X +"x86_64/amd/zen4" "AuthenticAMD" "avx2 fma vaes avx512f avx512ifma" # AMD Genoa, Genoa-X diff --git a/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.all.output b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.all.output new file mode 100644 index 0000000000..e1bbd79e4a --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.all.output @@ -0,0 +1 @@ +x86_64/amd/zen4:x86_64/amd/zen3:x86_64/amd/zen2:x86_64/generic diff --git a/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.cpuinfo b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.cpuinfo new file mode 100644 index 0000000000..4a97da862c --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.cpuinfo @@ -0,0 +1,27 @@ +processor : 0 +vendor_id : AuthenticAMD +cpu family : 25 +model : 17 +model name : AMD EPYC 9V33X 96-Core Processor +stepping : 1 +microcode : 0xffffffff +cpu MHz : 3705.853 +cache size : 1024 KB +physical id : 0 +siblings : 88 +core id : 0 +cpu cores : 88 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx512_bf16 clzero xsaveerptr arat npt nrip_save tsc_scale vmcb_clean flushbyasid decodeassists 
pausefilter pfthreshold v_vmsave_vmload avx512vbmi umip avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm +bugs : sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass +bogomips : 5100.08 +TLB size : 3584 4K pages +clflush size : 64 +cache_alignment : 64 +address sizes : 48 bits physical, 48 bits virtual +power management: diff --git a/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.output b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.output new file mode 100644 index 0000000000..950740a78c --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.output @@ -0,0 +1 @@ +x86_64/amd/zen4 diff --git a/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.all.output b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.all.output new file mode 100644 index 0000000000..e1bbd79e4a --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.all.output @@ -0,0 +1 @@ +x86_64/amd/zen4:x86_64/amd/zen3:x86_64/amd/zen2:x86_64/generic diff --git a/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.cpuinfo b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.cpuinfo new file mode 100644 index 0000000000..f28381d7a2 --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.cpuinfo @@ -0,0 +1,27 @@ +processor : 0 +vendor_id : AuthenticAMD +cpu family : 25 +model : 17 +model name : AMD EPYC 9654 96-Core Processor +stepping : 1 +microcode : 0xa10113e +cpu MHz : 3699.993 +cache size : 1024 KB +physical id : 0 +siblings : 96 +core id : 0 +cpu cores : 96 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 16 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic 
cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba perfmon_v2 ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d +bugs : sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass +bogomips : 4799.99 +TLB size : 3584 4K pages +clflush size : 64 +cache_alignment : 64 +address sizes : 52 bits physical, 57 bits virtual +power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14] diff --git a/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.output b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.output new file mode 100644 index 0000000000..950740a78c --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.output @@ -0,0 +1 @@ +x86_64/amd/zen4